// SPDX-License-Identifier: GPL-2.0

/*
 * Copyright (c) 2025, Google LLC.
 * Pasha Tatashin <pasha.tatashin@soleen.com>
 *
 * Copyright (C) 2025 Amazon.com Inc. or its affiliates.
 * Pratyush Yadav <ptyadav@amazon.de>
 */

/**
 * DOC: Memfd Preservation via LUO
 *
 * Overview
 * ========
 *
 * Memory file descriptors (memfd) can be preserved over a kexec using the
 * Live Update Orchestrator (LUO) file-preservation mechanism. This allows
 * userspace to transfer its memory contents to the next kernel after a kexec.
 *
 * The preservation is not intended to be transparent. Only select properties
 * of the file are preserved; all others are reset to their defaults. The
 * preserved properties are described below.
 *
 * .. note::
 *   The LUO API is not stabilized yet, so the preserved properties of a memfd
 *   are also not stable and are subject to backwards-incompatible changes.
 *
 * .. note::
 *   Currently, memfds backed by hugetlb are not supported. Memfds created
 *   with ``MFD_HUGETLB`` will be rejected.
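 *
 * For illustration, only a regular (tmpfs-backed) memfd can be handed to LUO;
 * the hugetlb variant is rejected. A minimal userspace sketch (the name and
 * the helper are illustrative)::
 *
 *	#define _GNU_SOURCE
 *	#include <sys/mman.h>
 *
 *	static int memfd_create_preservable(void)
 *	{
 *		// A plain memfd like this one can be preserved. Creating it
 *		// with MFD_HUGETLB instead would make it ineligible.
 *		// MFD_CLOEXEC only affects the fd flag, which is itself not
 *		// preserved (see "Non-Preserved Properties" below).
 *		return memfd_create("preserved-state", MFD_CLOEXEC);
 *	}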
 *
 * Preserved Properties
 * ====================
 *
 * The following properties of the memfd are preserved across kexec (see the
 * example after this list):
 *
 * File Contents
 *   All data stored in the file is preserved.
 *
 * File Size
 *   The size of the file is preserved. Holes in the file are filled by
 *   allocating pages for them during preservation.
 *
 * File Position
 *   The current file position is preserved, allowing applications to continue
 *   reading/writing from their last position.
 *
 * File Status Flags
 *   memfds are always opened with ``O_RDWR`` and ``O_LARGEFILE``. This
 *   property is maintained.
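 *
 * A minimal sketch of what userspace can rely on once the file has been
 * retrieved in the new kernel (the ``fd`` is assumed to have already been
 * obtained through the LUO retrieval interface; error handling omitted)::
 *
 *	#include <sys/stat.h>
 *	#include <unistd.h>
 *
 *	static void memfd_check_restored(int fd)
 *	{
 *		struct stat st;
 *
 *		// The contents and size survive the kexec.
 *		fstat(fd, &st);		// st.st_size == pre-kexec size
 *
 *		// The file position survives too, so reads and writes
 *		// continue from where they left off before the kexec.
 *		off_t pos = lseek(fd, 0, SEEK_CUR);
 *		(void)pos;
 *	}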
 *
 * Non-Preserved Properties
 * ========================
 *
 * All properties which are not preserved must be assumed to be reset to their
 * defaults. This section describes the non-preserved properties that are most
 * likely to matter.
 *
 * ``FD_CLOEXEC`` flag
 *   A memfd can be created with the ``MFD_CLOEXEC`` flag, which sets
 *   ``FD_CLOEXEC`` on the file descriptor. This flag is not preserved and
 *   must be set again after restore via ``fcntl()``.
 *
 * Seals
 *   File seals are not preserved. The file is unsealed on restore and, if
 *   needed, must be sealed again via ``fcntl()``, as shown in the sketch
 *   below.
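 *
 * A minimal sketch of re-applying both after restore (the ``fd`` and the
 * chosen seals are illustrative; error handling omitted)::
 *
 *	#define _GNU_SOURCE
 *	#include <fcntl.h>
 *
 *	static void memfd_reapply_properties(int fd)
 *	{
 *		// FD_CLOEXEC is not preserved; set it again if it was used.
 *		fcntl(fd, F_SETFD, FD_CLOEXEC);
 *
 *		// All seals are dropped on restore; re-apply the ones the
 *		// application relies on.
 *		fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW);
 *	}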
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bits.h>
#include <linux/err.h>
#include <linux/file.h>
#include <linux/io.h>
#include <linux/kexec_handover.h>
#include <linux/kho/abi/memfd.h>
#include <linux/liveupdate.h>
#include <linux/shmem_fs.h>
#include <linux/vmalloc.h>
#include "internal.h"

static int memfd_luo_preserve_folios(struct file *file,
				     struct kho_vmalloc *kho_vmalloc,
				     struct memfd_luo_folio_ser **out_folios_ser,
				     u64 *nr_foliosp)
{
	struct inode *inode = file_inode(file);
	struct memfd_luo_folio_ser *folios_ser;
	unsigned int max_folios;
	long i, size, nr_pinned;
	struct folio **folios;
	int err = -EINVAL;
	pgoff_t offset;
	u64 nr_folios;

	size = i_size_read(inode);
	/*
	 * If the file has zero size, then the folios and nr_folios properties
	 * are not set.
	 */
	if (!size) {
		*nr_foliosp = 0;
		*out_folios_ser = NULL;
		memset(kho_vmalloc, 0, sizeof(*kho_vmalloc));
		return 0;
	}

	/*
	 * Guess the number of folios based on inode size. Real number might
	 * end up being smaller if there are higher order folios.
	 */
	max_folios = PAGE_ALIGN(size) / PAGE_SIZE;
	folios = kvmalloc_array(max_folios, sizeof(*folios), GFP_KERNEL);
	if (!folios)
		return -ENOMEM;

	/*
	 * Pin the folios so they don't move around behind our back. This also
	 * ensures none of the folios are in CMA -- which ensures they don't
	 * fall in KHO scratch memory. It also moves swapped out folios back to
	 * memory.
	 *
	 * A side effect of doing this is that it allocates a folio for all
	 * indices in the file. This might waste memory on sparse memfds. If
	 * that is really a problem in the future, we can have a
	 * memfd_pin_folios() variant that does not allocate a page on empty
	 * slots.
	 */
	nr_pinned = memfd_pin_folios(file, 0, size - 1, folios, max_folios,
				     &offset);
	if (nr_pinned < 0) {
		err = nr_pinned;
		pr_err("failed to pin folios: %d\n", err);
		goto err_free_folios;
	}
	nr_folios = nr_pinned;

	folios_ser = vcalloc(nr_folios, sizeof(*folios_ser));
	if (!folios_ser) {
		err = -ENOMEM;
		goto err_unpin;
	}

	for (i = 0; i < nr_folios; i++) {
		struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio = folios[i];
		unsigned int flags = 0;

		err = kho_preserve_folio(folio);
		if (err)
			goto err_unpreserve;

		if (folio_test_dirty(folio))
			flags |= MEMFD_LUO_FOLIO_DIRTY;
		if (folio_test_uptodate(folio))
			flags |= MEMFD_LUO_FOLIO_UPTODATE;

		pfolio->pfn = folio_pfn(folio);
		pfolio->flags = flags;
		pfolio->index = folio->index;
	}

	err = kho_preserve_vmalloc(folios_ser, kho_vmalloc);
	if (err)
		goto err_unpreserve;

	kvfree(folios);
	*nr_foliosp = nr_folios;
	*out_folios_ser = folios_ser;

	/*
	 * Note: folios_ser is purposely not freed here. It is preserved
	 * memory (via KHO). In the 'unpreserve' path, we use the vmap pointer
	 * that is passed via private_data.
	 */
	return 0;

err_unpreserve:
	for (i = i - 1; i >= 0; i--)
		kho_unpreserve_folio(folios[i]);
	vfree(folios_ser);
err_unpin:
	unpin_folios(folios, nr_folios);
err_free_folios:
	kvfree(folios);

	return err;
}

static void memfd_luo_unpreserve_folios(struct kho_vmalloc *kho_vmalloc,
					struct memfd_luo_folio_ser *folios_ser,
					u64 nr_folios)
{
	long i;

	if (!nr_folios)
		return;

	kho_unpreserve_vmalloc(kho_vmalloc);

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;

		if (!pfolio->pfn)
			continue;

		folio = pfn_folio(pfolio->pfn);

		kho_unpreserve_folio(folio);
		unpin_folio(folio);
	}

	vfree(folios_ser);
}

static int memfd_luo_preserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	u64 nr_folios;
	int err = 0;

	inode_lock(inode);
	shmem_freeze(inode, true);

	/* Allocate the main serialization structure in preserved memory */
	ser = kho_alloc_preserve(sizeof(*ser));
	if (IS_ERR(ser)) {
		err = PTR_ERR(ser);
		goto err_unlock;
	}

	ser->pos = args->file->f_pos;
	ser->size = i_size_read(inode);

	err = memfd_luo_preserve_folios(args->file, &ser->folios,
					&folios_ser, &nr_folios);
	if (err)
		goto err_free_ser;

	ser->nr_folios = nr_folios;
	inode_unlock(inode);

	args->private_data = folios_ser;
	args->serialized_data = virt_to_phys(ser);

	return 0;

err_free_ser:
	kho_unpreserve_free(ser);
err_unlock:
	shmem_freeze(inode, false);
	inode_unlock(inode);
	return err;
}

static int memfd_luo_freeze(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return -EINVAL;

	ser = phys_to_virt(args->serialized_data);

	/*
	 * The pos might have changed since prepare. Everything else stays the
	 * same.
	 */
	ser->pos = args->file->f_pos;

	return 0;
}

static void memfd_luo_unpreserve(struct liveupdate_file_op_args *args)
{
	struct inode *inode = file_inode(args->file);
	struct memfd_luo_ser *ser;

	if (WARN_ON_ONCE(!args->serialized_data))
		return;

	inode_lock(inode);
	shmem_freeze(inode, false);

	ser = phys_to_virt(args->serialized_data);

	memfd_luo_unpreserve_folios(&ser->folios, args->private_data,
				    ser->nr_folios);

	kho_unpreserve_free(ser);
	inode_unlock(inode);
}

static void memfd_luo_discard_folios(const struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	u64 i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		struct folio *folio;
		phys_addr_t phys;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_warn_ratelimited("Unable to restore folio at physical address: %llx\n",
					    phys);
			continue;
		}

		folio_put(folio);
	}
}

static void memfd_luo_finish(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;

	if (args->retrieved)
		return;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser)
			goto out;

		memfd_luo_discard_folios(folios_ser, ser->nr_folios);
		vfree(folios_ser);
	}

out:
	kho_restore_free(ser);
}

static int memfd_luo_retrieve_folios(struct file *file,
				     struct memfd_luo_folio_ser *folios_ser,
				     u64 nr_folios)
{
	struct inode *inode = file_inode(file);
	struct address_space *mapping = inode->i_mapping;
	struct folio *folio;
	int err = -EIO;
	long i;

	for (i = 0; i < nr_folios; i++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[i];
		phys_addr_t phys;
		u64 index;
		int flags;

		if (!pfolio->pfn)
			continue;

		phys = PFN_PHYS(pfolio->pfn);
		folio = kho_restore_folio(phys);
		if (!folio) {
			pr_err("Unable to restore folio at physical address: %llx\n",
			       phys);
			goto put_folios;
		}
		index = pfolio->index;
		flags = pfolio->flags;

		/* Set up the folio for insertion. */
		__folio_set_locked(folio);
		__folio_set_swapbacked(folio);

		err = mem_cgroup_charge(folio, NULL, mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to charge folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		err = shmem_add_to_page_cache(folio, mapping, index, NULL,
					      mapping_gfp_mask(mapping));
		if (err) {
			pr_err("shmem: failed to add to page cache folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		if (flags & MEMFD_LUO_FOLIO_UPTODATE)
			folio_mark_uptodate(folio);
		if (flags & MEMFD_LUO_FOLIO_DIRTY)
			folio_mark_dirty(folio);

		err = shmem_inode_acct_blocks(inode, 1);
		if (err) {
			pr_err("shmem: failed to account folio index %ld: %d\n",
			       i, err);
			goto unlock_folio;
		}

		shmem_recalc_inode(inode, 1, 0);
		folio_add_lru(folio);
		folio_unlock(folio);
		folio_put(folio);
	}

	return 0;

unlock_folio:
	folio_unlock(folio);
	folio_put(folio);
put_folios:
	/*
	 * Note: don't free the folios already added to the file. They will be
	 * freed when the file is freed. Free the ones not added yet here.
	 */
	for (long j = i + 1; j < nr_folios; j++) {
		const struct memfd_luo_folio_ser *pfolio = &folios_ser[j];

		if (!pfolio->pfn)
			continue;

		/* kho_restore_folio() expects a physical address, not a PFN. */
		folio = kho_restore_folio(PFN_PHYS(pfolio->pfn));
		if (folio)
			folio_put(folio);
	}

	return err;
}

static int memfd_luo_retrieve(struct liveupdate_file_op_args *args)
{
	struct memfd_luo_folio_ser *folios_ser;
	struct memfd_luo_ser *ser;
	struct file *file;
	int err;

	ser = phys_to_virt(args->serialized_data);
	if (!ser)
		return -EINVAL;

	file = shmem_file_setup("", 0, VM_NORESERVE);
	if (IS_ERR(file)) {
		pr_err("failed to setup file: %pe\n", file);
		return PTR_ERR(file);
	}

	vfs_setpos(file, ser->pos, MAX_LFS_FILESIZE);
	file->f_inode->i_size = ser->size;

	if (ser->nr_folios) {
		folios_ser = kho_restore_vmalloc(&ser->folios);
		if (!folios_ser) {
			err = -EINVAL;
			goto put_file;
		}

		err = memfd_luo_retrieve_folios(file, folios_ser, ser->nr_folios);
		vfree(folios_ser);
		if (err)
			goto put_file;
	}

	args->file = file;
	kho_restore_free(ser);

	return 0;

put_file:
	fput(file);

	return err;
}

static bool memfd_luo_can_preserve(struct liveupdate_file_handler *handler,
				   struct file *file)
{
	struct inode *inode = file_inode(file);

	return shmem_file(file) && !inode->i_nlink;
}

static const struct liveupdate_file_ops memfd_luo_file_ops = {
	.freeze = memfd_luo_freeze,
	.finish = memfd_luo_finish,
	.retrieve = memfd_luo_retrieve,
	.preserve = memfd_luo_preserve,
	.unpreserve = memfd_luo_unpreserve,
	.can_preserve = memfd_luo_can_preserve,
	.owner = THIS_MODULE,
};

static struct liveupdate_file_handler memfd_luo_handler = {
	.ops = &memfd_luo_file_ops,
	.compatible = MEMFD_LUO_FH_COMPATIBLE,
};

static int __init memfd_luo_init(void)
{
	int err = liveupdate_register_file_handler(&memfd_luo_handler);

	if (err && err != -EOPNOTSUPP) {
		pr_err("Could not register luo filesystem handler: %pe\n",
		       ERR_PTR(err));
		return err;
	}

	return 0;
}
late_initcall(memfd_luo_init);