diff --git a/fs/Makefile b/fs/Makefile index a04274a3c854..becf133e4791 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -16,7 +16,7 @@ obj-y := open.o read_write.o file_table.o super.o \ stack.o fs_struct.o statfs.o fs_pin.o nsfs.o \ fs_dirent.o fs_context.o fs_parser.o fsopen.o init.o \ kernel_read_file.o mnt_idmapping.o remap_range.o pidfs.o \ - file_attr.o + file_attr.o nullfs.o obj-$(CONFIG_BUFFER_HEAD) += buffer.o mpage.o obj-$(CONFIG_PROC_FS) += proc_namespace.o diff --git a/fs/mount.h b/fs/mount.h index 2d28ef2a3aed..e0816c11a198 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,7 @@ #include #include +extern struct file_system_type nullfs_fs_type; extern struct list_head notify_list; struct mnt_namespace { diff --git a/fs/namespace.c b/fs/namespace.c index 9261f56ccc81..a44ebb2f1161 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -75,6 +75,17 @@ static int __init initramfs_options_setup(char *str) __setup("initramfs_options=", initramfs_options_setup); +/* Set to true by nullfs_rootfs_setup() when the nullfs_rootfs boot parameter is seen; read by init_mount_tree() and prepare_namespace(). */ bool nullfs_rootfs = false; + +static int __init nullfs_rootfs_setup(char *str) +{ + /* NOTE(review): str presumably points just past the matched option name, so any remaining text means the parameter is not accepted here; confirm against __setup() semantics. */ if (*str) + return 0; + nullfs_rootfs = true; + return 1; +} +__setup("nullfs_rootfs", nullfs_rootfs_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -4582,8 +4593,9 @@ int path_pivot_root(struct path *new, struct path *old) * pointed to by put_old must yield the same directory as new_root. No other * file system may be mounted on put_old. After all, new_root is a mountpoint. * - * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem. - * See Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives + * Also, the current root cannot be on the 'rootfs' (initial ramfs) filesystem + * unless the kernel was booted with "nullfs_rootfs". See + * Documentation/filesystems/ramfs-rootfs-initramfs.rst for alternatives * in this situation. 
* * Notes: @@ -5976,24 +5988,72 @@ struct mnt_namespace init_mnt_ns = { static void __init init_mount_tree(void) { - struct vfsmount *mnt; - struct mount *m; + /* nullfs_mnt is assigned and read only on the nullfs_rootfs paths below. */ struct vfsmount *mnt, *nullfs_mnt; + struct mount *mnt_root; struct path root; + /* + * When nullfs is used, we create two mounts: + * + * (1) nullfs with mount id 1 + * (2) mutable rootfs with mount id 2 + * + * with (2) mounted on top of (1). + */ + if (nullfs_rootfs) { + nullfs_mnt = vfs_kern_mount(&nullfs_fs_type, 0, "nullfs", NULL); + if (IS_ERR(nullfs_mnt)) + panic("VFS: Failed to create nullfs"); + } + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); - m = real_mount(mnt); - init_mnt_ns.root = m; - init_mnt_ns.nr_mounts = 1; - mnt_add_to_ns(&init_mnt_ns, m); + if (nullfs_rootfs) { + /* Sanity-check the mount ids described in the comment at the top of this function. */ VFS_WARN_ON_ONCE(real_mount(nullfs_mnt)->mnt_id != 1); + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 2); + + /* The namespace root is the nullfs mnt. */ + mnt_root = real_mount(nullfs_mnt); + init_mnt_ns.root = mnt_root; + + /* Mount mutable rootfs on top of nullfs. */ + root.mnt = nullfs_mnt; + root.dentry = nullfs_mnt->mnt_root; + + LOCK_MOUNT_EXACT(mp, &root); + if (unlikely(IS_ERR(mp.parent))) + panic("VFS: Failed to mount rootfs on nullfs"); + scoped_guard(mount_writer) + attach_mnt(real_mount(mnt), mp.parent, mp.mp); + + pr_info("VFS: Finished mounting rootfs on nullfs\n"); + } else { + VFS_WARN_ON_ONCE(real_mount(mnt)->mnt_id != 1); + + /* The namespace root is the mutable rootfs. */ + mnt_root = real_mount(mnt); + init_mnt_ns.root = mnt_root; + } + + /* + * We've dropped all locks here but that's fine. Not just are we + * the only task that's running, there's no other mount + * namespace in existence and the initial mount namespace is + * completely empty until we add the mounts we just created. 
+ */ + /* Add every mount that next_mnt() yields from mnt_root to init_mnt_ns, counting each one in nr_mounts. */ for (struct mount *p = mnt_root; p; p = next_mnt(p, mnt_root)) { + mnt_add_to_ns(&init_mnt_ns, p); + init_mnt_ns.nr_mounts++; + } + init_task.nsproxy->mnt_ns = &init_mnt_ns; get_mnt_ns(&init_mnt_ns); - root.mnt = mnt; - root.dentry = mnt->mnt_root; - + /* The root and pwd always point to the mutable rootfs. */ + root.mnt = mnt; + root.dentry = mnt->mnt_root; set_fs_pwd(current->fs, &root); set_fs_root(current->fs, &root); diff --git a/fs/nullfs.c b/fs/nullfs.c new file mode 100644 index 000000000000..fdbd3e5d3d71 --- /dev/null +++ b/fs/nullfs.c @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2026 Christian Brauner */ +#include +#include +#include + +static const struct super_operations nullfs_super_operations = { + .statfs = simple_statfs, +}; + +/* Fill in the nullfs superblock and give it a single empty, immutable root directory. fc is not used. Returns 0, or -ENOMEM if the inode or the root dentry cannot be allocated. */ static int nullfs_fs_fill_super(struct super_block *s, struct fs_context *fc) +{ + struct inode *inode; + + s->s_maxbytes = MAX_LFS_FILESIZE; + s->s_blocksize = PAGE_SIZE; + s->s_blocksize_bits = PAGE_SHIFT; + s->s_magic = NULL_FS_MAGIC; + s->s_op = &nullfs_super_operations; + s->s_export_op = NULL; + s->s_xattr = NULL; + s->s_time_gran = 1; + s->s_d_flags = 0; + + inode = new_inode(s); + if (!inode) + return -ENOMEM; + + /* nullfs is permanently empty... */ + make_empty_dir_inode(inode); + simple_inode_init_ts(inode); + inode->i_ino = 1; + /* ... and immutable. */ + inode->i_flags |= S_IMMUTABLE; + + s->s_root = d_make_root(inode); + if (!s->s_root) + return -ENOMEM; + + return 0; +} + +/* + * For now this is a single global instance. If needed we can make it + * mountable by userspace at which point we will need to make it + * multi-instance. 
+ */ +static int nullfs_fs_get_tree(struct fs_context *fc) +{ + return get_tree_single(fc, nullfs_fs_fill_super); +} + +static const struct fs_context_operations nullfs_fs_context_ops = { + .get_tree = nullfs_fs_get_tree, +}; + +/* Install the nullfs context operations and request an SB_NOUSER superblock with SB_I_NOEXEC and SB_I_NODEV set. Always returns 0. */ static int nullfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &nullfs_fs_context_ops; + fc->global = true; + fc->sb_flags = SB_NOUSER; + fc->s_iflags = SB_I_NOEXEC | SB_I_NODEV; + return 0; +} + +struct file_system_type nullfs_fs_type = { + .name = "nullfs", + .init_fs_context = nullfs_init_fs_context, + .kill_sb = kill_anon_super, +}; diff --git a/include/uapi/linux/magic.h b/include/uapi/linux/magic.h index 638ca21b7a90..4f2da935a76c 100644 --- a/include/uapi/linux/magic.h +++ b/include/uapi/linux/magic.h @@ -104,5 +104,6 @@ #define SECRETMEM_MAGIC 0x5345434d /* "SECM" */ #define PID_FS_MAGIC 0x50494446 /* "PIDF" */ #define GUEST_MEMFD_MAGIC 0x474d454d /* "GMEM" */ +#define NULL_FS_MAGIC 0x4E554C4C /* "NULL" */ #endif /* __LINUX_MAGIC_H__ */ diff --git a/init/do_mounts.c b/init/do_mounts.c index defbbf1d55f7..675397c8a7a4 100644 --- a/init/do_mounts.c +++ b/init/do_mounts.c @@ -492,6 +492,20 @@ void __init prepare_namespace(void) mount_root(saved_root_name); out: devtmpfs_mount(); + + /* With nullfs_rootfs, pivot into the current directory and unmount the old rootfs with MNT_DETACH, then return so the MS_MOVE and chroot steps below are skipped. A failure is logged and also returns early. */ if (nullfs_rootfs) { + if (init_pivot_root(".", ".")) { + pr_err("VFS: Failed to pivot into new rootfs\n"); + return; + } + if (init_umount(".", MNT_DETACH)) { + pr_err("VFS: Failed to unmount old rootfs\n"); + return; + } + pr_info("VFS: Pivoted into new rootfs\n"); + return; + } + init_mount(".", "/", NULL, MS_MOVE, NULL); init_chroot("."); } diff --git a/init/do_mounts.h b/init/do_mounts.h index 6069ea3eb80d..fbfee810aa89 100644 --- a/init/do_mounts.h +++ b/init/do_mounts.h @@ -15,6 +15,7 @@ void mount_root_generic(char *name, char *pretty_name, int flags); void mount_root(char *root_device_name); extern int root_mountflags; +extern bool nullfs_rootfs; static inline __init int create_dev(char *name, dev_t dev) {