From e462fc48ceb8224811c3224650afed05cb7f0872 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Fri, 3 Oct 2025 15:43:08 -0700 Subject: [PATCH 01/39] f2fs: maintain one time GC mode is enabled during whole zoned GC cycle The current version missed setting one time GC for normal zoned GC cycle. So, valid threshold control is not working. Need to fix it to prevent excessive GC for zoned devices. Fixes: e791d00bd06c ("f2fs: add valid block ratio not to do excessive GC for one time GC") Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a7708cf80c04..8abf521530ff 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -38,13 +38,14 @@ static int gc_thread_func(void *data) struct f2fs_gc_control gc_control = { .victim_segno = NULL_SEGNO, .should_migrate_blocks = false, - .err_gc_skipped = false }; + .err_gc_skipped = false, + .one_time = false }; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode, foreground = false; + bool sync_mode, foreground = false, gc_boost = false; wait_event_freezable_timeout(*wq, kthread_should_stop() || @@ -52,8 +53,12 @@ static int gc_thread_func(void *data) gc_th->gc_wake, msecs_to_jiffies(wait_ms)); - if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) { foreground = true; + gc_control.one_time = false; + } else if (f2fs_sb_has_blkzoned(sbi)) { + gc_control.one_time = true; + } /* give it a try one time */ if (gc_th->gc_wake) @@ -81,8 +86,6 @@ static int gc_thread_func(void *data) continue; } - gc_control.one_time = false; - /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -132,7 +135,7 @@ static int gc_thread_func(void *data) if (need_to_boost_gc(sbi)) { decrease_sleep_time(gc_th, &wait_ms); if (f2fs_sb_has_blkzoned(sbi)) - gc_control.one_time = true; + gc_boost = true; } else { increase_sleep_time(gc_th, &wait_ms); } @@ -141,7 +144,7 @@ do_gc: FOREGROUND : BACKGROUND); sync_mode = (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC) || - (gc_control.one_time && gc_th->boost_gc_greedy); + (gc_boost && gc_th->boost_gc_greedy); /* foreground GC was been triggered via f2fs_balance_fs() */ if (foreground && !f2fs_sb_has_blkzoned(sbi)) From 81464df36094340395cadc9235e24eb4defa8c43 Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 7 Oct 2025 09:46:14 -0700 Subject: [PATCH 02/39] f2fs: set default valid_thresh_ratio to 80 for zoned devices Zoned storage devices provide marginal over-capacity space, typically around 10%, for filesystem level storage control. By utilizing this extra capacity, we can safely reduce the default 'valid_thresh_ratio' to 80. This action helps to significantly prevent excessive garbage collection (GC) and the resulting power consumption, as the filesystem becomes less aggressive about cleaning segments that still hold a high percentage of valid data. Reviewed-by: Chao Yu Signed-off-by: Daeho Jeong Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 24e8b1c27acc..6c4d4567571e 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -25,7 +25,7 @@ #define DEF_GC_THREAD_CANDIDATE_RATIO 20 /* select 20% oldest sections as candidates */ #define DEF_GC_THREAD_MAX_CANDIDATE_COUNT 10 /* select at most 10 sections as candidates */ #define DEF_GC_THREAD_AGE_WEIGHT 60 /* age weight */ -#define DEF_GC_THREAD_VALID_THRESH_RATIO 95 /* do not GC over 95% valid block ratio for one time GC */ +#define DEF_GC_THREAD_VALID_THRESH_RATIO 80 /* do not GC over 80% valid block ratio for one time GC */ #define DEFAULT_ACCURACY_CLASS 10000 /* accuracy class */ #define LIMIT_INVALID_BLOCK 40 /* percentage over total user space */ From e4384545e22024d39edc13c63433f37e31960671 Mon Sep 17 00:00:00 2001 From: Pedro Demarchi Gomes Date: Sat, 4 Oct 2025 00:12:17 -0300 Subject: [PATCH 03/39] f2fs: use folio_nr_pages() instead of shift operation folio_nr_pages() is a faster helper function to get the number of pages when NR_PAGES_IN_LARGE_FOLIO is enabled. Signed-off-by: Pedro Demarchi Gomes Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5b4e9548a231..b6e35fdd5fd3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2096,7 +2096,7 @@ static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) static inline struct f2fs_super_block *F2FS_SUPER_BLOCK(struct folio *folio, pgoff_t index) { - pgoff_t idx_in_folio = index % (1 << folio_order(folio)); + pgoff_t idx_in_folio = index % folio_nr_pages(folio); return (struct f2fs_super_block *) (page_address(folio_page(folio, idx_in_folio)) + From 2308de27c03d8ed75b28dab2354eb02c5f8e69be Mon Sep 17 00:00:00 2001 From: Jiucheng Xu Date: Fri, 10 Oct 2025 10:45:50 +0000 Subject: [PATCH 04/39] f2fs: Use mapping->gfp_mask to get file cache for writing On 32-bit architectures, when GFP_NOFS is used, the file cache for write operations cannot be allocated from the highmem and CMA. Since mapping->gfp_mask is set to GFP_HIGHUSER_MOVABLE during inode allocation, using mapping_gfp_mask(mapping) as the GFP flag of getting file cache for writing is more efficient for 32-bit architectures. Additionally, use FGP_NOFS to avoid potential deadlock issues caused by GFP_FS in GFP_HIGHUSER_MOVABLE Signed-off-by: Jiucheng Xu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 775aa4f63aa3..16a713013427 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3570,7 +3570,8 @@ repeat: * Will wait that below with our IO control. */ folio = __filemap_get_folio(mapping, index, - FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS); + FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_NOFS, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { err = PTR_ERR(folio); goto fail; From 28b68b2a3b2fae3789717ca9e306ae8f01269849 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Oct 2025 14:27:01 +0800 Subject: [PATCH 05/39] f2fs: clean up w/ bio_add_folio_nofail() In add_bio_entry(), adding a page to newly allocated bio should never fail, let's use bio_add_folio_nofail() instead of bio_add_page() & unnecessary error handling for cleanup. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 16a713013427..d8a9f8f8cb5d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -752,7 +752,7 @@ static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, } static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, - struct page *page, enum temp_type temp) + struct folio *folio, enum temp_type temp) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; struct bio_entry *be; @@ -761,8 +761,7 @@ static void add_bio_entry(struct f2fs_sb_info *sbi, struct bio *bio, be->bio = bio; bio_get(bio); - if (bio_add_page(bio, page, PAGE_SIZE, 0) != PAGE_SIZE) - f2fs_bug_on(sbi, 1); + bio_add_folio_nofail(bio, folio, folio_size(folio), 0); f2fs_down_write(&io->bio_list_lock); list_add_tail(&be->list, &io->bio_list); @@ -904,7 +903,7 @@ alloc_new: f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, folio->index, fio, GFP_NOIO); - add_bio_entry(fio->sbi, bio, &data_folio->page, fio->temp); + add_bio_entry(fio->sbi, bio, data_folio, fio->temp); } else { if (add_ipu_page(fio, &bio, &data_folio->page)) goto alloc_new; From 3b7e73ddc07f77eeb67474354b44ec7fed8e8b56 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Oct 2025 14:27:02 +0800 Subject: [PATCH 06/39] f2fs: convert add_ipu_page() to use folio No logic changes. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d8a9f8f8cb5d..68151617d313 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -775,7 +775,7 @@ static void del_bio_entry(struct bio_entry *be) } static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, - struct page *page) + struct folio *folio) { struct folio *fio_folio = fio->folio; struct f2fs_sb_info *sbi = fio->sbi; @@ -801,8 +801,7 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, if (f2fs_crypt_mergeable_bio(*bio, fio_folio->mapping->host, fio_folio->index, fio) && - bio_add_page(*bio, page, PAGE_SIZE, 0) == - PAGE_SIZE) { + bio_add_folio(*bio, folio, folio_size(folio), 0)) { ret = 0; break; } @@ -905,7 +904,7 @@ alloc_new: add_bio_entry(fio->sbi, bio, data_folio, fio->temp); } else { - if (add_ipu_page(fio, &bio, &data_folio->page)) + if (add_ipu_page(fio, &bio, data_folio)) goto alloc_new; } From e0b89d00ea9f846da42fc92f200c96254d0e2fef Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Oct 2025 14:27:03 +0800 Subject: [PATCH 07/39] f2fs: use f2fs_filemap_get_folio() instead of f2fs_pagecache_get_page() Let's use f2fs_filemap_get_folio() instead of f2fs_pagecache_get_page() in ra_data_block() and move_data_block(), then remove f2fs_pagecache_get_page() since it has no user. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 ---------- fs/f2fs/gc.c | 23 +++++++++++++---------- 2 files changed, 13 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index b6e35fdd5fd3..9cc3b83b8d10 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2961,16 +2961,6 @@ static inline struct folio *f2fs_filemap_get_folio( return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask); } -static inline struct page *f2fs_pagecache_get_page( - struct address_space *mapping, pgoff_t index, - fgf_t fgp_flags, gfp_t gfp_mask) -{ - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) - return NULL; - - return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); -} - static inline void f2fs_folio_put(struct folio *folio, bool unlock) { if (IS_ERR_OR_NULL(folio)) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 8abf521530ff..22fe6e2c6d5c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1211,7 +1211,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = f2fs_is_cow_file(inode) ? F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; struct dnode_of_data dn; - struct folio *folio; + struct folio *folio, *efolio; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1266,14 +1266,15 @@ got_it: f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); - fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(sbi), - dn.data_blkaddr, + efolio = f2fs_filemap_get_folio(META_MAPPING(sbi), dn.data_blkaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); - if (!fio.encrypted_page) { - err = -ENOMEM; + if (IS_ERR(efolio)) { + err = PTR_ERR(efolio); goto put_folio; } + fio.encrypted_page = &efolio->page; + err = f2fs_submit_page_bio(&fio); if (err) goto put_encrypted_page; @@ -1313,7 +1314,7 @@ static int move_data_block(struct inode *inode, block_t bidx, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - struct folio *folio, *mfolio; + struct folio *folio, *mfolio, *efolio; block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); @@ -1407,14 +1408,16 @@ static int move_data_block(struct inode *inode, block_t bidx, goto up_out; } - fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi), - newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); - if (!fio.encrypted_page) { - err = -ENOMEM; + efolio = f2fs_filemap_get_folio(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (IS_ERR(efolio)) { + err = PTR_ERR(efolio); f2fs_folio_put(mfolio, true); goto recover_block; } + fio.encrypted_page = &efolio->page; + /* write target block */ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); memcpy(page_address(fio.encrypted_page), From c1cdb0048832e84549cd24964ba6bdd71d44a4ae Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Oct 2025 14:27:04 +0800 Subject: [PATCH 08/39] f2fs: use f2fs_filemap_get_folio() to support fault injection Use f2fs_filemap_get_folio() instead of __filemap_get_folio() in: - f2fs_find_data_folio - f2fs_write_begin - f2fs_read_merkle_tree_page So that, we can trigger fault injection in those places. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 4 ++-- fs/f2fs/verity.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 68151617d313..eec691262fec 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1273,7 +1273,7 @@ struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct folio *folio; - folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + folio = f2fs_filemap_get_folio(mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio)) goto read; if (folio_test_uptodate(folio)) @@ -3567,7 +3567,7 @@ repeat: * Do not use FGP_STABLE to avoid deadlock. * Will wait that below with our IO control. */ - folio = __filemap_get_folio(mapping, index, + folio = f2fs_filemap_get_folio(mapping, index, FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index f0ab9a3c7a82..05b935b55216 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -263,7 +263,7 @@ static struct page *f2fs_read_merkle_tree_page(struct inode *inode, index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT; - folio = __filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); + folio = f2fs_filemap_get_folio(inode->i_mapping, index, FGP_ACCESSED, 0); if (IS_ERR(folio) || !folio_test_uptodate(folio)) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, index); From ca8b201f28547e28343a6f00a6e91fa8c09572fe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 14 Oct 2025 19:47:35 +0800 Subject: [PATCH 09/39] f2fs: fix to avoid potential deadlock As Jiaming Zhang and syzbot reported, there is potential deadlock in f2fs as below: Chain exists of: &sbi->cp_rwsem --> fs_reclaim --> sb_internal#2 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- rlock(sb_internal#2); lock(fs_reclaim); lock(sb_internal#2); rlock(&sbi->cp_rwsem); *** DEADLOCK *** 3 locks held by kswapd0/73: #0: ffffffff8e247a40 (fs_reclaim){+.+.}-{0:0}, at: balance_pgdat mm/vmscan.c:7015 [inline] #0: ffffffff8e247a40 (fs_reclaim){+.+.}-{0:0}, at: kswapd+0x951/0x2800 mm/vmscan.c:7389 #1: ffff8880118400e0 (&type->s_umount_key#50){.+.+}-{4:4}, at: super_trylock_shared fs/super.c:562 [inline] #1: ffff8880118400e0 (&type->s_umount_key#50){.+.+}-{4:4}, at: super_cache_scan+0x91/0x4b0 fs/super.c:197 #2: ffff888011840610 (sb_internal#2){.+.+}-{0:0}, at: f2fs_evict_inode+0x8d9/0x1b60 fs/f2fs/inode.c:890 stack backtrace: CPU: 0 UID: 0 PID: 73 Comm: kswapd0 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_circular_bug+0x2ee/0x310 kernel/locking/lockdep.c:2043 check_noncircular+0x134/0x160 kernel/locking/lockdep.c:2175 check_prev_add kernel/locking/lockdep.c:3165 [inline] check_prevs_add kernel/locking/lockdep.c:3284 [inline] validate_chain+0xb9b/0x2140 kernel/locking/lockdep.c:3908 __lock_acquire+0xab9/0xd20 kernel/locking/lockdep.c:5237 lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5868 down_read+0x46/0x2e0 kernel/locking/rwsem.c:1537 f2fs_down_read fs/f2fs/f2fs.h:2278 [inline] f2fs_lock_op fs/f2fs/f2fs.h:2357 [inline] f2fs_do_truncate_blocks+0x21c/0x10c0 fs/f2fs/file.c:791 f2fs_truncate_blocks+0x10a/0x300 fs/f2fs/file.c:867 f2fs_truncate+0x489/0x7c0 fs/f2fs/file.c:925 f2fs_evict_inode+0x9f2/0x1b60 fs/f2fs/inode.c:897 evict+0x504/0x9c0 fs/inode.c:810 f2fs_evict_inode+0x1dc/0x1b60 fs/f2fs/inode.c:853 evict+0x504/0x9c0 fs/inode.c:810 dispose_list fs/inode.c:852 [inline] prune_icache_sb+0x21b/0x2c0 fs/inode.c:1000 super_cache_scan+0x39b/0x4b0 fs/super.c:224 do_shrink_slab+0x6ef/0x1110 mm/shrinker.c:437 shrink_slab_memcg mm/shrinker.c:550 [inline] shrink_slab+0x7ef/0x10d0 mm/shrinker.c:628 shrink_one+0x28a/0x7c0 mm/vmscan.c:4955 shrink_many mm/vmscan.c:5016 [inline] lru_gen_shrink_node mm/vmscan.c:5094 [inline] shrink_node+0x315d/0x3780 mm/vmscan.c:6081 kswapd_shrink_node mm/vmscan.c:6941 [inline] balance_pgdat mm/vmscan.c:7124 [inline] kswapd+0x147c/0x2800 mm/vmscan.c:7389 kthread+0x70e/0x8a0 kernel/kthread.c:463 ret_from_fork+0x4bc/0x870 arch/x86/kernel/process.c:158 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 The root cause is deadlock among four locks as below: kswapd - fs_reclaim --- Lock A - shrink_one - evict - f2fs_evict_inode - sb_start_intwrite --- Lock B - iput - evict - f2fs_evict_inode - sb_start_intwrite --- Lock B - f2fs_truncate - f2fs_truncate_blocks - f2fs_do_truncate_blocks - f2fs_lock_op --- Lock C ioctl - f2fs_ioc_commit_atomic_write - f2fs_lock_op --- Lock C - __f2fs_commit_atomic_write - __replace_atomic_write_block - f2fs_get_dnode_of_data - __get_node_folio - f2fs_check_nid_range - f2fs_handle_error - f2fs_record_errors - f2fs_down_write --- Lock D open - do_open - do_truncate - security_inode_need_killpriv - f2fs_getxattr - lookup_all_xattrs - f2fs_handle_error - f2fs_record_errors - f2fs_down_write --- Lock D - f2fs_commit_super - read_mapping_folio - filemap_alloc_folio_noprof - prepare_alloc_pages - fs_reclaim_acquire --- Lock A In order to avoid such deadlock, we need to avoid grabbing sb_lock in f2fs_handle_error(), so, let's use asynchronous method instead: - remove f2fs_handle_error() implementation - rename f2fs_handle_error_async() to f2fs_handle_error() - spread f2fs_handle_error() Fixes: 95fa90c9e5a7 ("f2fs: support recording errors into superblock") Cc: stable@kernel.org Reported-by: syzbot+14b90e1156b9f6fc1266@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/68eae49b.050a0220.ac43.0001.GAE@google.com Reported-by: Jiaming Zhang Closes: https://lore.kernel.org/lkml/CANypQFa-Gy9sD-N35o3PC+FystOWkNuN8pv6S75HLT0ga-Tzgw@mail.gmail.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 5 +---- fs/f2fs/f2fs.h | 1 - fs/f2fs/super.c | 41 ----------------------------------------- 3 files changed, 1 insertion(+), 46 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 6ad8d3bc6df7..811bfe38e5c0 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -759,10 +759,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) ret = -EFSCORRUPTED; /* Avoid f2fs_commit_super in irq context */ - if (!in_task) - f2fs_handle_error_async(sbi, ERROR_FAIL_DECOMPRESSION); - else - f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); + f2fs_handle_error(sbi, ERROR_FAIL_DECOMPRESSION); goto out_release; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9cc3b83b8d10..575f9666c3b7 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -3800,7 +3800,6 @@ void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason); void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error); -void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error); int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); int f2fs_sync_fs(struct super_block *sb, int sync); int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index db7afb806411..cb65ca90f9f6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4544,48 +4544,7 @@ void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag) spin_unlock_irqrestore(&sbi->error_lock, flags); } -static bool f2fs_update_errors(struct f2fs_sb_info *sbi) -{ - unsigned long flags; - bool need_update = false; - - spin_lock_irqsave(&sbi->error_lock, flags); - if (sbi->error_dirty) { - memcpy(F2FS_RAW_SUPER(sbi)->s_errors, sbi->errors, - MAX_F2FS_ERRORS); - sbi->error_dirty = false; - need_update = true; - } - spin_unlock_irqrestore(&sbi->error_lock, flags); - - return need_update; -} - -static void f2fs_record_errors(struct f2fs_sb_info *sbi, unsigned char error) -{ - int err; - - f2fs_down_write(&sbi->sb_lock); - - if (!f2fs_update_errors(sbi)) - goto out_unlock; - - err = f2fs_commit_super(sbi, false); - if (err) - f2fs_err_ratelimited(sbi, - "f2fs_commit_super fails to record errors:%u, err:%d", - error, err); -out_unlock: - f2fs_up_write(&sbi->sb_lock); -} - void f2fs_handle_error(struct f2fs_sb_info *sbi, unsigned char error) -{ - f2fs_save_errors(sbi, error); - f2fs_record_errors(sbi, error); -} - -void f2fs_handle_error_async(struct f2fs_sb_info *sbi, unsigned char error) { f2fs_save_errors(sbi, error); From 7c37c79510329cd951a4dedf3f7bf7e2b18dccec Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 20 Oct 2025 10:42:12 +0800 Subject: [PATCH 10/39] f2fs: fix to avoid updating zero-sized extent in extent cache As syzbot reported: F2FS-fs (loop0): __update_extent_tree_range: extent len is zero, type: 0, extent [0, 0, 0], age [0, 0] ------------[ cut here ]------------ kernel BUG at fs/f2fs/extent_cache.c:678! Oops: invalid opcode: 0000 [#1] SMP KASAN NOPTI CPU: 0 UID: 0 PID: 5336 Comm: syz.0.0 Not tainted syzkaller #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:__update_extent_tree_range+0x13bc/0x1500 fs/f2fs/extent_cache.c:678 Call Trace: f2fs_update_read_extent_cache_range+0x192/0x3e0 fs/f2fs/extent_cache.c:1085 f2fs_do_zero_range fs/f2fs/file.c:1657 [inline] f2fs_zero_range+0x10c1/0x1580 fs/f2fs/file.c:1737 f2fs_fallocate+0x583/0x990 fs/f2fs/file.c:2030 vfs_fallocate+0x669/0x7e0 fs/open.c:342 ioctl_preallocate fs/ioctl.c:289 [inline] file_ioctl+0x611/0x780 fs/ioctl.c:-1 do_vfs_ioctl+0xb33/0x1430 fs/ioctl.c:576 __do_sys_ioctl fs/ioctl.c:595 [inline] __se_sys_ioctl+0x82/0x170 fs/ioctl.c:583 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xfa/0x3b0 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f07bc58eec9 In error path of f2fs_zero_range(), it may add a zero-sized extent into extent cache, it should be avoided. Fixes: 6e9619499f53 ("f2fs: support in batch fzero in dnode page") Cc: stable@kernel.org Reported-by: syzbot+24124df3170c3638b35f@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-f2fs-devel/68e5d698.050a0220.256323.0032.GAE@google.com Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ffa045b39c01..c045e38e60ee 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1654,8 +1654,11 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, f2fs_set_data_blkaddr(dn, NEW_ADDR); } - f2fs_update_read_extent_cache_range(dn, start, 0, index - start); - f2fs_update_age_extent_cache_range(dn, start, index - start); + if (index > start) { + f2fs_update_read_extent_cache_range(dn, start, 0, + index - start); + f2fs_update_age_extent_cache_range(dn, start, index - start); + } return ret; } From 10b591e7fb7cdc8c1e53e9c000dc0ef7069aaa76 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 22 Oct 2025 11:06:36 +0800 Subject: [PATCH 11/39] f2fs: fix to avoid updating compression context during writeback Bai, Shuangpeng reported a bug as below: Oops: divide error: 0000 [#1] SMP KASAN PTI CPU: 0 UID: 0 PID: 11441 Comm: syz.0.46 Not tainted 6.17.0 #1 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 RIP: 0010:f2fs_all_cluster_page_ready+0x106/0x550 fs/f2fs/compress.c:857 Call Trace: f2fs_write_cache_pages fs/f2fs/data.c:3078 [inline] __f2fs_write_data_pages fs/f2fs/data.c:3290 [inline] f2fs_write_data_pages+0x1c19/0x3600 fs/f2fs/data.c:3317 do_writepages+0x38e/0x640 mm/page-writeback.c:2634 filemap_fdatawrite_wbc mm/filemap.c:386 [inline] __filemap_fdatawrite_range mm/filemap.c:419 [inline] file_write_and_wait_range+0x2ba/0x3e0 mm/filemap.c:794 f2fs_do_sync_file+0x6e6/0x1b00 fs/f2fs/file.c:294 generic_write_sync include/linux/fs.h:3043 [inline] f2fs_file_write_iter+0x76e/0x2700 fs/f2fs/file.c:5259 new_sync_write fs/read_write.c:593 [inline] vfs_write+0x7e9/0xe00 fs/read_write.c:686 ksys_write+0x19d/0x2d0 fs/read_write.c:738 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf7/0x470 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f The bug was triggered w/ below race condition: fsync setattr ioctl - f2fs_do_sync_file - file_write_and_wait_range - f2fs_write_cache_pages : inode is non-compressed : cc.cluster_size = F2FS_I(inode)->i_cluster_size = 0 - tag_pages_for_writeback - f2fs_setattr - truncate_setsize - f2fs_truncate - f2fs_fileattr_set - f2fs_setflags_common - set_compress_context : F2FS_I(inode)->i_cluster_size = 4 : set_inode_flag(inode, FI_COMPRESSED_FILE) - f2fs_compressed_file : return true - f2fs_all_cluster_page_ready : "pgidx % cc->cluster_size" trigger dividing 0 issue Let's change as below to fix this issue: - introduce a new atomic type variable .writeback in structure f2fs_inode_info to track the number of threads which calling f2fs_write_cache_pages(). - use .i_sem lock to protect .writeback update. - check .writeback before update compression context in f2fs_setflags_common() to avoid race w/ ->writepages. Fixes: 4c8ff7095bef ("f2fs: support data compression") Cc: stable@kernel.org Reported-by: Bai, Shuangpeng Tested-by: Bai, Shuangpeng Closes: https://lore.kernel.org/lkml/44D8F7B3-68AD-425F-9915-65D27591F93F@psu.edu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 17 +++++++++++++++++ fs/f2fs/f2fs.h | 3 ++- fs/f2fs/file.c | 5 +++-- fs/f2fs/super.c | 1 + 4 files changed, 23 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index eec691262fec..b92d362a02d6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3222,6 +3222,19 @@ static inline bool __should_serialize_io(struct inode *inode, return false; } +static inline void account_writeback(struct inode *inode, bool inc) +{ + if (!f2fs_sb_has_compression(F2FS_I_SB(inode))) + return; + + f2fs_down_read(&F2FS_I(inode)->i_sem); + if (inc) + atomic_inc(&F2FS_I(inode)->writeback); + else + atomic_dec(&F2FS_I(inode)->writeback); + f2fs_up_read(&F2FS_I(inode)->i_sem); +} + static int __f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc, enum iostat_type io_type) @@ -3267,10 +3280,14 @@ static int __f2fs_write_data_pages(struct address_space *mapping, locked = true; } + account_writeback(inode, true); + blk_start_plug(&plug); ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + account_writeback(inode, false); + if (locked) mutex_unlock(&sbi->writepages); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 575f9666c3b7..e69b01c1173a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -947,6 +947,7 @@ struct f2fs_inode_info { unsigned char i_compress_level; /* compress level (lz4hc,zstd) */ unsigned char i_compress_flag; /* compress flag */ unsigned int i_cluster_size; /* cluster size */ + atomic_t writeback; /* count # of writeback thread */ unsigned int atomic_write_cnt; loff_t original_i_size; /* original i_size before atomic write */ @@ -4663,7 +4664,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) f2fs_up_write(&fi->i_sem); return true; } - if (f2fs_is_mmap_file(inode) || + if (f2fs_is_mmap_file(inode) || atomic_read(&fi->writeback) || (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode))) { f2fs_up_write(&fi->i_sem); return false; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c045e38e60ee..6d42e2d28861 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2128,8 +2128,9 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) f2fs_down_write(&fi->i_sem); if (!f2fs_may_compress(inode) || - (S_ISREG(inode->i_mode) && - F2FS_HAS_BLOCKS(inode))) { + atomic_read(&fi->writeback) || + (S_ISREG(inode->i_mode) && + F2FS_HAS_BLOCKS(inode))) { f2fs_up_write(&fi->i_sem); return -EINVAL; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cb65ca90f9f6..d0b5791a1f8c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1759,6 +1759,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) atomic_set(&fi->dirty_pages, 0); atomic_set(&fi->i_compr_blocks, 0); atomic_set(&fi->open_count, 0); + atomic_set(&fi->writeback, 0); init_f2fs_rwsem(&fi->i_sem); spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); From 1f27ef42bb0b7c0740c5616ec577ec188b8a1d05 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 21 Oct 2025 11:48:56 +0800 Subject: [PATCH 12/39] f2fs: use global inline_xattr_slab instead of per-sb slab cache MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As Hong Yun reported in mailing list: loop7: detected capacity change from 0 to 131072 ------------[ cut here ]------------ kmem_cache of name 'f2fs_xattr_entry-7:7' already exists WARNING: CPU: 0 PID: 24426 at mm/slab_common.c:110 kmem_cache_sanity_check mm/slab_common.c:109 [inline] WARNING: CPU: 0 PID: 24426 at mm/slab_common.c:110 __kmem_cache_create_args+0xa6/0x320 mm/slab_common.c:307 CPU: 0 UID: 0 PID: 24426 Comm: syz.7.1370 Not tainted 6.17.0-rc4 #1 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.13.0-1ubuntu1.1 04/01/2014 RIP: 0010:kmem_cache_sanity_check mm/slab_common.c:109 [inline] RIP: 0010:__kmem_cache_create_args+0xa6/0x320 mm/slab_common.c:307 Call Trace:  __kmem_cache_create include/linux/slab.h:353 [inline]  f2fs_kmem_cache_create fs/f2fs/f2fs.h:2943 [inline]  f2fs_init_xattr_caches+0xa5/0xe0 fs/f2fs/xattr.c:843  f2fs_fill_super+0x1645/0x2620 fs/f2fs/super.c:4918  get_tree_bdev_flags+0x1fb/0x260 fs/super.c:1692  vfs_get_tree+0x43/0x140 fs/super.c:1815  do_new_mount+0x201/0x550 fs/namespace.c:3808  do_mount fs/namespace.c:4136 [inline]  __do_sys_mount fs/namespace.c:4347 [inline]  __se_sys_mount+0x298/0x2f0 fs/namespace.c:4324  do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]  do_syscall_64+0x8e/0x3a0 arch/x86/entry/syscall_64.c:94  entry_SYSCALL_64_after_hwframe+0x76/0x7e The bug can be reproduced w/ below scripts: - mount /dev/vdb /mnt1 - mount /dev/vdc /mnt2 - umount /mnt1 - mounnt /dev/vdb /mnt1 The reason is if we created two slab caches, named f2fs_xattr_entry-7:3 and f2fs_xattr_entry-7:7, and they have the same slab size. Actually, slab system will only create one slab cache core structure which has slab name of "f2fs_xattr_entry-7:3", and two slab caches share the same structure and cache address. So, if we destroy f2fs_xattr_entry-7:3 cache w/ cache address, it will decrease reference count of slab cache, rather than release slab cache entirely, since there is one more user has referenced the cache. Then, if we try to create slab cache w/ name "f2fs_xattr_entry-7:3" again, slab system will find that there is existed cache which has the same name and trigger the warning. Let's changes to use global inline_xattr_slab instead of per-sb slab cache for fixing. Fixes: a999150f4fe3 ("f2fs: use kmem_cache pool during inline xattr lookups") Cc: stable@kernel.org Reported-by: Hong Yun Tested-by: Hong Yun Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 --- fs/f2fs/super.c | 17 ++++++++--------- fs/f2fs/xattr.c | 32 +++++++++++--------------------- fs/f2fs/xattr.h | 10 ++++++---- 4 files changed, 25 insertions(+), 37 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e69b01c1173a..9ca2124aac84 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1885,9 +1885,6 @@ struct f2fs_sb_info { spinlock_t error_lock; /* protect errors/stop_reason array */ bool error_dirty; /* errors of sb is dirty */ - struct kmem_cache *inline_xattr_slab; /* inline xattr entry */ - unsigned int inline_xattr_slab_size; /* default inline xattr slab size */ - /* For reclaimed segs statistics per each GC mode */ unsigned int gc_segment_mode; /* GC state for reclaimed segments */ unsigned int gc_reclaimed_segs[MAX_GC_MODE]; /* Reclaimed segs for each mode */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d0b5791a1f8c..f76ba2b08be0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2027,7 +2027,6 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); f2fs_destroy_page_array_cache(sbi); - f2fs_destroy_xattr_caches(sbi); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(F2FS_OPTION(sbi).s_qf_names[i]); @@ -4975,13 +4974,9 @@ try_onemore: if (err) goto free_iostat; - /* init per sbi slab cache */ - err = f2fs_init_xattr_caches(sbi); - if (err) - goto free_percpu; err = f2fs_init_page_array_cache(sbi); if (err) - goto free_xattr_cache; + goto free_percpu; /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); @@ -5310,8 +5305,6 @@ free_meta_inode: sbi->meta_inode = NULL; free_page_array_cache: f2fs_destroy_page_array_cache(sbi); -free_xattr_cache: - f2fs_destroy_xattr_caches(sbi); free_percpu: destroy_percpu_info(sbi); free_iostat: @@ -5514,10 +5507,15 @@ static int __init init_f2fs_fs(void) err = f2fs_create_casefold_cache(); if (err) goto free_compress_cache; - err = register_filesystem(&f2fs_fs_type); + err = f2fs_init_xattr_cache(); if (err) goto free_casefold_cache; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_xattr_cache; return 0; +free_xattr_cache: + f2fs_destroy_xattr_cache(); free_casefold_cache: f2fs_destroy_casefold_cache(); free_compress_cache: @@ -5558,6 +5556,7 @@ fail: static void __exit exit_f2fs_fs(void) { unregister_filesystem(&f2fs_fs_type); + f2fs_destroy_xattr_cache(); f2fs_destroy_casefold_cache(); f2fs_destroy_compress_cache(); f2fs_destroy_compress_mempool(); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 58632a2b6613..b4e5c406632f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -23,11 +23,12 @@ #include "xattr.h" #include "segment.h" +static struct kmem_cache *inline_xattr_slab; static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline) { - if (likely(size == sbi->inline_xattr_slab_size)) { + if (likely(size == DEFAULT_XATTR_SLAB_SIZE)) { *is_inline = true; - return f2fs_kmem_cache_alloc(sbi->inline_xattr_slab, + return f2fs_kmem_cache_alloc(inline_xattr_slab, GFP_F2FS_ZERO, false, sbi); } *is_inline = false; @@ -38,7 +39,7 @@ static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr, bool is_inline) { if (is_inline) - kmem_cache_free(sbi->inline_xattr_slab, xattr_addr); + kmem_cache_free(inline_xattr_slab, xattr_addr); else kfree(xattr_addr); } @@ -830,25 +831,14 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, return err; } -int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) +int __init f2fs_init_xattr_cache(void) { - dev_t dev = sbi->sb->s_bdev->bd_dev; - char slab_name[32]; - - sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev)); - - sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size * - sizeof(__le32) + XATTR_PADDING_SIZE; - - sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name, - sbi->inline_xattr_slab_size); - if (!sbi->inline_xattr_slab) - return -ENOMEM; - - return 0; + inline_xattr_slab = f2fs_kmem_cache_create("f2fs_xattr_entry", + DEFAULT_XATTR_SLAB_SIZE); + return inline_xattr_slab ? 0 : -ENOMEM; } -void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) +void f2fs_destroy_xattr_cache(void) { - kmem_cache_destroy(sbi->inline_xattr_slab); -} + kmem_cache_destroy(inline_xattr_slab); +} \ No newline at end of file diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 4fc0b2305fbd..bce3d93e4755 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -89,6 +89,8 @@ struct f2fs_xattr_entry { F2FS_TOTAL_EXTRA_ATTR_SIZE / sizeof(__le32) - \ DEF_INLINE_RESERVED_SIZE - \ MIN_INLINE_DENTRY_SIZE / sizeof(__le32)) +#define DEFAULT_XATTR_SLAB_SIZE (DEFAULT_INLINE_XATTR_ADDRS * \ + sizeof(__le32) + XATTR_PADDING_SIZE) /* * On-disk structure of f2fs_xattr @@ -132,8 +134,8 @@ int f2fs_setxattr(struct inode *, int, const char *, const void *, int f2fs_getxattr(struct inode *, int, const char *, void *, size_t, struct folio *); ssize_t f2fs_listxattr(struct dentry *, char *, size_t); -int f2fs_init_xattr_caches(struct f2fs_sb_info *); -void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); +int __init f2fs_init_xattr_cache(void); +void f2fs_destroy_xattr_cache(void); #else #define f2fs_xattr_handlers NULL @@ -150,8 +152,8 @@ static inline int f2fs_getxattr(struct inode *inode, int index, { return -EOPNOTSUPP; } -static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; } -static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } +static inline int __init f2fs_init_xattr_cache(void) { return 0; } +static inline void f2fs_destroy_xattr_cache(void) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY From d33f89b34aa313f50f9a512d58dd288999f246b0 Mon Sep 17 00:00:00 2001 From: Deepanshu Kartikey Date: Mon, 27 Oct 2025 18:36:34 +0530 Subject: [PATCH 13/39] f2fs: invalidate dentry cache on failed whiteout creation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit F2FS can mount filesystems with corrupted directory depth values that get runtime-clamped to MAX_DIR_HASH_DEPTH. When RENAME_WHITEOUT operations are performed on such directories, f2fs_rename performs directory modifications (updating target entry and deleting source entry) before attempting to add the whiteout entry via f2fs_add_link. If f2fs_add_link fails due to the corrupted directory structure, the function returns an error to VFS, but the partial directory modifications have already been committed to disk. VFS assumes the entire rename operation failed and does not update the dentry cache, leaving stale mappings. In the error path, VFS does not call d_move() to update the dentry cache. This results in new_dentry still pointing to the old inode (new_inode) which has already had its i_nlink decremented to zero. The stale cache causes subsequent operations to incorrectly reference the freed inode. This causes subsequent operations to use cached dentry information that no longer matches the on-disk state. When a second rename targets the same entry, VFS attempts to decrement i_nlink on the stale inode, which may already have i_nlink=0, triggering a WARNING in drop_nlink(). Example sequence: 1. First rename (RENAME_WHITEOUT): file2 → file1 - f2fs updates file1 entry on disk (points to inode 8) - f2fs deletes file2 entry on disk - f2fs_add_link(whiteout) fails (corrupted directory) - Returns error to VFS - VFS does not call d_move() due to error - VFS cache still has: file1 → inode 7 (stale!) - inode 7 has i_nlink=0 (already decremented) 2. Second rename: file3 → file1 - VFS uses stale cache: file1 → inode 7 - Tries to drop_nlink on inode 7 (i_nlink already 0) - WARNING in drop_nlink() Fix this by explicitly invalidating old_dentry and new_dentry when f2fs_add_link fails during whiteout creation. This forces VFS to refresh from disk on subsequent operations, ensuring cache consistency even when the rename partially succeeds. Reproducer: 1. Mount F2FS image with corrupted i_current_depth 2. renameat2(file2, file1, RENAME_WHITEOUT) 3. renameat2(file3, file1, 0) 4. System triggers WARNING in drop_nlink() Fixes: 7e01e7ad746b ("f2fs: support RENAME_WHITEOUT") Reported-by: syzbot+632cf32276a9a564188d@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=632cf32276a9a564188d Suggested-by: Chao Yu Link: https://lore.kernel.org/all/20251022233349.102728-1-kartikey406@gmail.com/ [v1] Cc: stable@vger.kernel.org Signed-off-by: Deepanshu Kartikey Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index b882771e4699..712479b7b93d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1053,9 +1053,11 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (whiteout) { set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); - if (err) + if (err) { + d_invalidate(old_dentry); + d_invalidate(new_dentry); goto put_out_dir; - + } spin_lock(&whiteout->i_lock); whiteout->i_state &= ~I_LINKABLE; spin_unlock(&whiteout->i_lock); From 89c16629e3136f0972dfa270d9318f07fa1c4053 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Mon, 27 Oct 2025 20:55:43 +0800 Subject: [PATCH 14/39] f2fs: change the unlock parameter of f2fs_put_page to bool Change the type of the unlock parameter of f2fs_put_page to bool. All callers should consistently pass true or false. No logical change. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/compress.c | 8 ++++---- fs/f2fs/data.c | 6 ++---- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 6 +++--- fs/f2fs/inline.c | 4 ++-- fs/f2fs/namei.c | 4 ++-- 6 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 811bfe38e5c0..716004ba44dc 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -120,7 +120,7 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) } static void f2fs_put_rpages_wbc(struct compress_ctx *cc, - struct writeback_control *wbc, bool redirty, int unlock) + struct writeback_control *wbc, bool redirty, bool unlock) { unsigned int i; @@ -1202,7 +1202,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, if (copied) set_cluster_dirty(&cc); - f2fs_put_rpages_wbc(&cc, NULL, false, 1); + f2fs_put_rpages_wbc(&cc, NULL, false, true); f2fs_destroy_compress_ctx(&cc, false); return first_index; @@ -1605,7 +1605,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { - f2fs_put_rpages_wbc(cc, wbc, true, 1); + f2fs_put_rpages_wbc(cc, wbc, true, true); goto destroy_out; } @@ -1619,7 +1619,7 @@ write: f2fs_bug_on(F2FS_I_SB(cc->inode), *submitted); err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); - f2fs_put_rpages_wbc(cc, wbc, false, 0); + f2fs_put_rpages_wbc(cc, wbc, false, false); destroy_out: f2fs_destroy_compress_ctx(cc, false); return err; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b92d362a02d6..ee519de1aa1a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3656,8 +3656,7 @@ repeat: return 0; put_folio: - folio_unlock(folio); - folio_put(folio); + f2fs_folio_put(folio, true); fail: f2fs_write_failed(inode, pos + len); return err; @@ -3713,8 +3712,7 @@ static int f2fs_write_end(const struct kiocb *iocb, pos + copied); } unlock_out: - folio_unlock(folio); - folio_put(folio); + f2fs_folio_put(folio, true); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9ca2124aac84..d4f5648477de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2971,7 +2971,7 @@ static inline void f2fs_folio_put(struct folio *folio, bool unlock) folio_put(folio); } -static inline void f2fs_put_page(struct page *page, int unlock) +static inline void f2fs_put_page(struct page *page, bool unlock) { if (!page) return; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 22fe6e2c6d5c..fd8bb0424bf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1278,7 +1278,7 @@ got_it: err = f2fs_submit_page_bio(&fio); if (err) goto put_encrypted_page; - f2fs_put_page(fio.encrypted_page, 0); + f2fs_put_page(fio.encrypted_page, false); f2fs_folio_put(folio, true); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); @@ -1286,7 +1286,7 @@ got_it: return 0; put_encrypted_page: - f2fs_put_page(fio.encrypted_page, 1); + f2fs_put_page(fio.encrypted_page, true); put_folio: f2fs_folio_put(folio, true); return err; @@ -1442,7 +1442,7 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); - f2fs_put_page(fio.encrypted_page, 1); + f2fs_put_page(fio.encrypted_page, true); recover_block: if (err) f2fs_do_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 58ac831ef704..e5c6a08b7e4f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -287,7 +287,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) set_inode_flag(inode, FI_DATA_EXIST); folio_clear_f2fs_inline(ifolio); - f2fs_folio_put(ifolio, 1); + f2fs_folio_put(ifolio, true); return 0; } @@ -577,7 +577,7 @@ recover: f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); folio_mark_dirty(ifolio); - f2fs_folio_put(ifolio, 1); + f2fs_folio_put(ifolio, true); kfree(backup_dentry); return err; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 712479b7b93d..85e2d13841da 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1249,11 +1249,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_folio_put(new_dir_folio, 0); + f2fs_folio_put(new_dir_folio, false); } out_old_dir: if (old_dir_entry) { - f2fs_folio_put(old_dir_folio, 0); + f2fs_folio_put(old_dir_folio, false); } out_new: f2fs_folio_put(new_folio, false); From be112e7449a6e1b54aa9feac618825d154b3a5c7 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Oct 2025 14:35:33 +0800 Subject: [PATCH 15/39] f2fs: fix to propagate error from f2fs_enable_checkpoint() In order to let userspace detect such error rather than suffering silent failure. Fixes: 4354994f097d ("f2fs: checkpoint disabling") Cc: stable@kernel.org Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f76ba2b08be0..60382c9b5293 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2632,10 +2632,11 @@ restore_flag: return err; } -static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) +static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; long long start, writeback, end; + int ret; f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", get_pages(sbi, F2FS_DIRTY_META), @@ -2669,7 +2670,9 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_up_write(&sbi->gc_lock); - f2fs_sync_fs(sbi->sb, 1); + ret = f2fs_sync_fs(sbi->sb, 1); + if (ret) + f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret); /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); @@ -2679,6 +2682,7 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", ktime_ms_delta(writeback, start), ktime_ms_delta(end, writeback)); + return ret; } static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) @@ -2892,7 +2896,9 @@ static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) goto restore_discard; need_enable_checkpoint = true; } else { - f2fs_enable_checkpoint(sbi); + err = f2fs_enable_checkpoint(sbi); + if (err) + goto restore_discard; need_disable_checkpoint = true; } } @@ -2935,7 +2941,8 @@ skip: return 0; restore_checkpoint: if (need_enable_checkpoint) { - f2fs_enable_checkpoint(sbi); + if (f2fs_enable_checkpoint(sbi)) + f2fs_warn(sbi, "checkpoint has not been enabled"); } else if (need_disable_checkpoint) { if (f2fs_disable_checkpoint(sbi)) f2fs_warn(sbi, "checkpoint has not been disabled"); @@ -5212,13 +5219,12 @@ reset_checkpoint: if (err) goto sync_free_meta; - if (test_opt(sbi, DISABLE_CHECKPOINT)) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) err = f2fs_disable_checkpoint(sbi); - if (err) - goto sync_free_meta; - } else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) { - f2fs_enable_checkpoint(sbi); - } + else if (is_set_ckpt_flags(sbi, CP_DISABLED_FLAG)) + err = f2fs_enable_checkpoint(sbi); + if (err) + goto sync_free_meta; /* * If filesystem is not mounted as read-only then From 196c81fdd438f7ac429d5639090a9816abb9760a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 27 Oct 2025 14:35:34 +0800 Subject: [PATCH 16/39] f2fs: block cache/dio write during f2fs_enable_checkpoint() If there are too many background IOs during f2fs_enable_checkpoint(), sync_inodes_sb() may be blocked for long time due to it will loop to write dirty datas which are generated by in parallel write() continuously. Let's change as below to resolve this issue: - hold cp_enable_rwsem write lock to block any cache/dio write - decrease DEF_ENABLE_INTERVAL from 16 to 5 In addition, dump more logs during f2fs_enable_checkpoint(). Testcase: 1. fill data into filesystem until 90% usage. 2. mount -o remount,checkpoint=disable:10% /data 3. fio --rw=randwrite --bs=4kb --size=1GB --numjobs=10 \ --iodepth=64 --ioengine=psync --time_based --runtime=600 \ --directory=/data/fio_dir/ & 4. mount -o remount,checkpoint=enable /data Before: F2FS-fs (dm-51): f2fs_enable_checkpoint() finishes, writeback:7232, sync:39793, cp:457 After: F2FS-fs (dm-51): f2fs_enable_checkpoint end, writeback:5032, lock:0, sync_inode:5552, sync_fs:84 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 ++ fs/f2fs/f2fs.h | 3 ++- fs/f2fs/super.c | 38 ++++++++++++++++++++++++++++++-------- 3 files changed, 34 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ee519de1aa1a..7be0837a9456 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1418,6 +1418,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) static void f2fs_map_lock(struct f2fs_sb_info *sbi, int flag) { + f2fs_down_read(&sbi->cp_enable_rwsem); if (flag == F2FS_GET_BLOCK_PRE_AIO) f2fs_down_read(&sbi->node_change); else @@ -1430,6 +1431,7 @@ static void f2fs_map_unlock(struct f2fs_sb_info *sbi, int flag) f2fs_up_read(&sbi->node_change); else f2fs_unlock_op(sbi); + f2fs_up_read(&sbi->cp_enable_rwsem); } int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4f5648477de..0e2c5e86a6a1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -281,7 +281,7 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ -#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ +#define DEF_ENABLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ @@ -1695,6 +1695,7 @@ struct f2fs_sb_info { long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ struct cp_stats cp_stats; /* for time stat of checkpoint */ + struct f2fs_rwsem cp_enable_rwsem; /* block cache/dio write */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 60382c9b5293..bdb5ddb4f966 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2635,10 +2635,11 @@ restore_flag: static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; - long long start, writeback, end; + long long start, writeback, lock, sync_inode, end; int ret; - f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", + f2fs_info(sbi, "%s start, meta: %lld, node: %lld, data: %lld", + __func__, get_pages(sbi, F2FS_DIRTY_META), get_pages(sbi, F2FS_DIRTY_NODES), get_pages(sbi, F2FS_DIRTY_DATA)); @@ -2657,11 +2658,18 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) } writeback = ktime_get(); - sync_inodes_sb(sbi->sb); + f2fs_down_write(&sbi->cp_enable_rwsem); + + lock = ktime_get(); + + if (get_pages(sbi, F2FS_DIRTY_DATA)) + sync_inodes_sb(sbi->sb); if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", - get_pages(sbi, F2FS_DIRTY_DATA)); + f2fs_warn(sbi, "%s: has some unwritten data: %lld", + __func__, get_pages(sbi, F2FS_DIRTY_DATA)); + + sync_inode = ktime_get(); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -2670,6 +2678,13 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) set_sbi_flag(sbi, SBI_IS_DIRTY); f2fs_up_write(&sbi->gc_lock); + f2fs_info(sbi, "%s sync_fs, meta: %lld, imeta: %lld, node: %lld, dents: %lld, qdata: %lld", + __func__, + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_IMETA), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_QDATA)); ret = f2fs_sync_fs(sbi->sb, 1); if (ret) f2fs_err(sbi, "%s sync_fs failed, ret: %d", __func__, ret); @@ -2677,11 +2692,17 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); + f2fs_up_write(&sbi->cp_enable_rwsem); + end = ktime_get(); - f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", - ktime_ms_delta(writeback, start), - ktime_ms_delta(end, writeback)); + f2fs_info(sbi, "%s end, writeback:%llu, " + "lock:%llu, sync_inode:%llu, sync_fs:%llu", + __func__, + ktime_ms_delta(writeback, start), + ktime_ms_delta(lock, writeback), + ktime_ms_delta(sync_inode, lock), + ktime_ms_delta(end, sync_inode)); return ret; } @@ -4870,6 +4891,7 @@ try_onemore: init_f2fs_rwsem(&sbi->node_change); spin_lock_init(&sbi->stat_lock); init_f2fs_rwsem(&sbi->cp_rwsem); + init_f2fs_rwsem(&sbi->cp_enable_rwsem); init_f2fs_rwsem(&sbi->quota_sem); init_waitqueue_head(&sbi->cp_wait); spin_lock_init(&sbi->error_lock); From 297baa4aa263ff8f5b3d246ee16a660d76aa82c4 Mon Sep 17 00:00:00 2001 From: Jan Prusakowski Date: Mon, 6 Oct 2025 10:46:15 +0200 Subject: [PATCH 17/39] f2fs: ensure node page reads complete before f2fs_put_super() finishes Xfstests generic/335, generic/336 sometimes crash with the following message: F2FS-fs (dm-0): detect filesystem reference count leak during umount, type: 9, count: 1 ------------[ cut here ]------------ kernel BUG at fs/f2fs/super.c:1939! Oops: invalid opcode: 0000 [#1] SMP NOPTI CPU: 1 UID: 0 PID: 609351 Comm: umount Tainted: G W 6.17.0-rc5-xfstests-g9dd1835ecda5 #1 PREEMPT(none) Tainted: [W]=WARN Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:f2fs_put_super+0x3b3/0x3c0 Call Trace: generic_shutdown_super+0x7e/0x190 kill_block_super+0x1a/0x40 kill_f2fs_super+0x9d/0x190 deactivate_locked_super+0x30/0xb0 cleanup_mnt+0xba/0x150 task_work_run+0x5c/0xa0 exit_to_user_mode_loop+0xb7/0xc0 do_syscall_64+0x1ae/0x1c0 entry_SYSCALL_64_after_hwframe+0x76/0x7e ---[ end trace 0000000000000000 ]--- It appears that sometimes it is possible that f2fs_put_super() is called before all node page reads are completed. Adding a call to f2fs_wait_on_all_pages() for F2FS_RD_NODE fixes the problem. Cc: stable@kernel.org Fixes: 20872584b8c0b ("f2fs: fix to drop all dirty meta/node pages during umount()") Signed-off-by: Jan Prusakowski Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index bdb5ddb4f966..0b0ef8ba243b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1989,14 +1989,6 @@ static void f2fs_put_super(struct super_block *sb) truncate_inode_pages_final(META_MAPPING(sbi)); } - for (i = 0; i < NR_COUNT_TYPE; i++) { - if (!get_pages(sbi, i)) - continue; - f2fs_err(sbi, "detect filesystem reference count leak during " - "umount, type: %d, count: %lld", i, get_pages(sbi, i)); - f2fs_bug_on(sbi, 1); - } - f2fs_bug_on(sbi, sbi->fsync_node_num); f2fs_destroy_compress_inode(sbi); @@ -2007,6 +1999,15 @@ static void f2fs_put_super(struct super_block *sb) iput(sbi->meta_inode); sbi->meta_inode = NULL; + /* Should check the page counts after dropping all node/meta pages */ + for (i = 0; i < NR_COUNT_TYPE; i++) { + if (!get_pages(sbi, i)) + continue; + f2fs_err(sbi, "detect filesystem reference count leak during " + "umount, type: %d, count: %lld", i, get_pages(sbi, i)); + f2fs_bug_on(sbi, 1); + } + /* * iput() can update stat information, if f2fs_write_checkpoint() * above failed with error. From 5b5578c3b06eba4c256bc3a2788f5a65cd9f31ea Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 29 Oct 2025 14:31:04 +0800 Subject: [PATCH 18/39] f2fs: fix to access i_size w/ i_size_read() It recommends to use i_size_{read,write}() to access and update i_size, otherwise, we may get wrong tearing value due to high 32-bits value and low 32-bits value of i_size field are not updated atomically in 32-bits archicture machine. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- include/trace/events/f2fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index edbbd869078f..e1fae78d64a5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -204,7 +204,7 @@ DECLARE_EVENT_CLASS(f2fs__inode, __entry->pino = F2FS_I(inode)->i_pino; __entry->mode = inode->i_mode; __entry->nlink = inode->i_nlink; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->advise = F2FS_I(inode)->i_advise; ), @@ -353,7 +353,7 @@ TRACE_EVENT(f2fs_unlink_enter, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->size = dir->i_size; + __entry->size = i_size_read(dir); __entry->blocks = dir->i_blocks; __assign_str(name); ), @@ -433,7 +433,7 @@ DECLARE_EVENT_CLASS(f2fs__truncate_op, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->ino = inode->i_ino; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->from = from; ), @@ -1006,7 +1006,7 @@ TRACE_EVENT(f2fs_fallocate, __entry->mode = mode; __entry->offset = offset; __entry->len = len; - __entry->size = inode->i_size; + __entry->size = i_size_read(inode); __entry->blocks = inode->i_blocks; __entry->ret = ret; ), From 392711ef18bff524a873b9c239a73148c5432262 Mon Sep 17 00:00:00 2001 From: Xiaole He Date: Wed, 29 Oct 2025 13:18:07 +0800 Subject: [PATCH 19/39] f2fs: fix uninitialized one_time_gc in victim_sel_policy The one_time_gc field in struct victim_sel_policy is conditionally initialized but unconditionally read, leading to undefined behavior that triggers UBSAN warnings. In f2fs_get_victim() at fs/f2fs/gc.c:774, the victim_sel_policy structure is declared without initialization: struct victim_sel_policy p; The field p.one_time_gc is only assigned when the 'one_time' parameter is true (line 789): if (one_time) { p.one_time_gc = one_time; ... } However, this field is unconditionally read in subsequent get_gc_cost() at line 395: if (p->one_time_gc && (valid_thresh_ratio < 100) && ...) When one_time is false, p.one_time_gc contains uninitialized stack memory. Hence p.one_time_gc is an invalid bool value. UBSAN detects this invalid bool value: UBSAN: invalid-load in fs/f2fs/gc.c:395:7 load of value 77 is not a valid value for type '_Bool' CPU: 3 UID: 0 PID: 1297 Comm: f2fs_gc-252:16 Not tainted 6.18.0-rc3 #5 PREEMPT(voluntary) Hardware name: OpenStack Foundation OpenStack Nova, BIOS 1.13.0-1ubuntu1.1 04/01/2014 Call Trace: dump_stack_lvl+0x70/0x90 dump_stack+0x14/0x20 __ubsan_handle_load_invalid_value+0xb3/0xf0 ? dl_server_update+0x2e/0x40 ? update_curr+0x147/0x170 f2fs_get_victim.cold+0x66/0x134 [f2fs] ? sched_balance_newidle+0x2ca/0x470 ? finish_task_switch.isra.0+0x8d/0x2a0 f2fs_gc+0x2ba/0x8e0 [f2fs] ? _raw_spin_unlock_irqrestore+0x12/0x40 ? __timer_delete_sync+0x80/0xe0 ? timer_delete_sync+0x14/0x20 ? schedule_timeout+0x82/0x100 gc_thread_func+0x38b/0x860 [f2fs] ? gc_thread_func+0x38b/0x860 [f2fs] ? __pfx_autoremove_wake_function+0x10/0x10 kthread+0x10b/0x220 ? __pfx_gc_thread_func+0x10/0x10 [f2fs] ? _raw_spin_unlock_irq+0x12/0x40 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x11a/0x160 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 This issue is reliably reproducible with the following steps on a 100GB SSD /dev/vdb: mkfs.f2fs -f /dev/vdb mount /dev/vdb /mnt/f2fs_test fio --name=gc --directory=/mnt/f2fs_test --rw=randwrite \ --bs=4k --size=8G --numjobs=12 --fsync=4 --runtime=10 \ --time_based echo 1 > /sys/fs/f2fs/vdb/gc_urgent The uninitialized value causes incorrect GC victim selection, leading to unpredictable garbage collection behavior. Fix by zero-initializing the entire victim_sel_policy structure to ensure all fields have defined values. Fixes: e791d00bd06c ("f2fs: add valid block ratio not to do excessive GC for one time GC") Cc: stable@kernel.org Signed-off-by: Xiaole He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fd8bb0424bf3..e04aafee1f2c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -774,7 +774,7 @@ int f2fs_get_victim(struct f2fs_sb_info *sbi, unsigned int *result, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct sit_info *sm = SIT_I(sbi); - struct victim_sel_policy p; + struct victim_sel_policy p = {0}; unsigned int secno, last_victim; unsigned int last_segment; unsigned int nsearched; From d8bdf7856e17b31263bcd37d60903ee36bd2f857 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Fri, 24 Oct 2025 22:37:46 +0800 Subject: [PATCH 20/39] f2fs: ensure minimum trim granularity accounts for all devices When F2FS uses multiple block devices, each device may have a different discard granularity. The minimum trim granularity must be at least the maximum discard granularity of all devices, excluding zoned devices. Use max_t instead of the max() macro to compute the maximum value. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 12 ++++++++++++ fs/f2fs/file.c | 12 ++++++------ 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0e2c5e86a6a1..c85384e8cf42 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4752,6 +4752,18 @@ static inline bool f2fs_hw_support_discard(struct f2fs_sb_info *sbi) return false; } +static inline unsigned int f2fs_hw_discard_granularity(struct f2fs_sb_info *sbi) +{ + int i = 1; + unsigned int discard_granularity = bdev_discard_granularity(sbi->sb->s_bdev); + + if (f2fs_is_multi_device(sbi)) + for (; i < sbi->s_ndevs && !bdev_is_zoned(FDEV(i).bdev); i++) + discard_granularity = max_t(unsigned int, discard_granularity, + bdev_discard_granularity(FDEV(i).bdev)); + return discard_granularity; +} + static inline bool f2fs_realtime_discard_enable(struct f2fs_sb_info *sbi) { return (test_opt(sbi, DISCARD) && f2fs_hw_support_discard(sbi)) || diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 6d42e2d28861..7b966f6d40d2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -2588,14 +2588,14 @@ static int f2fs_keep_noreuse_range(struct inode *inode, static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fstrim_range range; int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!f2fs_hw_support_discard(F2FS_SB(sb))) + if (!f2fs_hw_support_discard(sbi)) return -EOPNOTSUPP; if (copy_from_user(&range, (struct fstrim_range __user *)arg, @@ -2606,9 +2606,9 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (ret) return ret; - range.minlen = max((unsigned int)range.minlen, - bdev_discard_granularity(sb->s_bdev)); - ret = f2fs_trim_fs(F2FS_SB(sb), &range); + range.minlen = max_t(unsigned int, range.minlen, + f2fs_hw_discard_granularity(sbi)); + ret = f2fs_trim_fs(sbi, &range); mnt_drop_write_file(filp); if (ret < 0) return ret; @@ -2616,7 +2616,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; } From 9b3c8336c633ca11778a1ff42b7c37b0563e6430 Mon Sep 17 00:00:00 2001 From: "Nikola Z. Ivanov" Date: Wed, 5 Nov 2025 13:09:42 +0200 Subject: [PATCH 21/39] f2fs: Rename f2fs_unlink exit label Rename "fail" label to "out" as it's used as a default exit path out of f2fs_unlink as well as error path. Signed-off-by: Nikola Z. Ivanov Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 85e2d13841da..0245a2800170 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -552,21 +552,21 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto fail; + goto out; } err = f2fs_dquot_initialize(dir); if (err) - goto fail; + goto out; err = f2fs_dquot_initialize(inode); if (err) - goto fail; + goto out; de = f2fs_find_entry(dir, &dentry->d_name, &folio); if (!de) { if (IS_ERR(folio)) err = PTR_ERR(folio); - goto fail; + goto out; } if (unlikely(inode->i_nlink == 0)) { @@ -575,7 +575,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = -EFSCORRUPTED; set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); f2fs_folio_put(folio, false); - goto fail; + goto out; } f2fs_balance_fs(sbi, true); @@ -585,7 +585,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (err) { f2fs_unlock_op(sbi); f2fs_folio_put(folio, false); - goto fail; + goto out; } f2fs_delete_entry(de, folio, dir, inode); f2fs_unlock_op(sbi); @@ -601,7 +601,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); -fail: +out: trace_f2fs_unlink_exit(inode, err); return err; } From f37981edcd06cd552c15c153c3202a6b2fa450e4 Mon Sep 17 00:00:00 2001 From: "Nikola Z. Ivanov" Date: Wed, 5 Nov 2025 13:09:43 +0200 Subject: [PATCH 22/39] f2fs: Add sanity checks before unlinking and loading inodes Add check for inode->i_nlink == 1 for directories during unlink, as their value is decremented twice, which can trigger a warning in drop_nlink. In such case mark the filesystem as corrupted and return from the function call with the relevant failure return value. Additionally add the check for i_nlink == 1 in sanity_check_inode in order to detect on-disk corruption early. Reported-by: syzbot+c07d47c7bc68f47b9083@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=c07d47c7bc68f47b9083 Tested-by: syzbot+c07d47c7bc68f47b9083@syzkaller.appspotmail.com Signed-off-by: Nikola Z. Ivanov Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 6 ++++++ fs/f2fs/namei.c | 17 ++++++++++++----- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 8c4eafe9ffac..e2405b79b3cc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -294,6 +294,12 @@ static bool sanity_check_inode(struct inode *inode, struct folio *node_folio) return false; } + if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { + f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + __func__, inode->i_ino); + return false; + } + if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0245a2800170..1530314e8b92 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -570,12 +570,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) } if (unlikely(inode->i_nlink == 0)) { - f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + f2fs_warn(sbi, "%s: inode (ino=%lx) has zero i_nlink", __func__, inode->i_ino); - err = -EFSCORRUPTED; - set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); - f2fs_folio_put(folio, false); - goto out; + goto corrupted; + } else if (S_ISDIR(inode->i_mode) && unlikely(inode->i_nlink == 1)) { + f2fs_warn(sbi, "%s: directory inode (ino=%lx) has a single i_nlink", + __func__, inode->i_ino); + goto corrupted; } f2fs_balance_fs(sbi, true); @@ -601,6 +602,12 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + goto out; +corrupted: + err = -EFSCORRUPTED; + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_folio_put(folio, false); out: trace_f2fs_unlink_exit(inode, err); return err; From 27bf6a637b7613fc85fa6af468b7d612d78cd5c0 Mon Sep 17 00:00:00 2001 From: Xiaole He Date: Mon, 27 Oct 2025 17:23:41 +0800 Subject: [PATCH 23/39] f2fs: fix age extent cache insertion skip on counter overflow The age extent cache uses last_blocks (derived from allocated_data_blocks) to determine data age. However, there's a conflict between the deletion marker (last_blocks=0) and legitimate last_blocks=0 cases when allocated_data_blocks overflows to 0 after reaching ULLONG_MAX. In this case, valid extents are incorrectly skipped due to the "if (!tei->last_blocks)" check in __update_extent_tree_range(). This patch fixes the issue by: 1. Reserving ULLONG_MAX as an invalid/deletion marker 2. Limiting allocated_data_blocks to range [0, ULLONG_MAX-1] 3. Using F2FS_EXTENT_AGE_INVALID for deletion scenarios 4. Adjusting overflow age calculation from ULLONG_MAX to (ULLONG_MAX-1) Reproducer (using a patched kernel with allocated_data_blocks initialized to ULLONG_MAX - 3 for quick testing): Step 1: Mount and check initial state # dd if=/dev/zero of=/tmp/test.img bs=1M count=100 # mkfs.f2fs -f /tmp/test.img # mkdir -p /mnt/f2fs_test # mount -t f2fs -o loop,age_extent_cache /tmp/test.img /mnt/f2fs_test # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 18446744073709551612 # ULLONG_MAX - 3 Inner Struct Count: tree: 1(0), node: 0 Step 2: Create files and write data to trigger overflow # touch /mnt/f2fs_test/{1,2,3,4}.txt; sync # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 18446744073709551613 # ULLONG_MAX - 2 Inner Struct Count: tree: 5(0), node: 1 # dd if=/dev/urandom of=/mnt/f2fs_test/1.txt bs=4K count=1; sync # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 18446744073709551614 # ULLONG_MAX - 1 Inner Struct Count: tree: 5(0), node: 2 # dd if=/dev/urandom of=/mnt/f2fs_test/2.txt bs=4K count=1; sync # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 18446744073709551615 # ULLONG_MAX Inner Struct Count: tree: 5(0), node: 3 # dd if=/dev/urandom of=/mnt/f2fs_test/3.txt bs=4K count=1; sync # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 0 # Counter overflowed! Inner Struct Count: tree: 5(0), node: 4 Step 3: Trigger the bug - next write should create node but gets skipped # dd if=/dev/urandom of=/mnt/f2fs_test/4.txt bs=4K count=1; sync # cat /sys/kernel/debug/f2fs/status | grep -A 4 "Block Age" Allocated Data Blocks: 1 Inner Struct Count: tree: 5(0), node: 4 Expected: node: 5 (new extent node for 4.txt) Actual: node: 4 (extent insertion was incorrectly skipped due to last_blocks = allocated_data_blocks = 0 in __get_new_block_age) After this fix, the extent node is correctly inserted and node count becomes 5 as expected. Fixes: 71644dff4811 ("f2fs: add block_age-based extent cache") Cc: stable@kernel.org Signed-off-by: Xiaole He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/segment.c | 9 +++++++-- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 33e09c453c70..0ed84cc065a7 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -808,7 +808,7 @@ static void __update_extent_tree_range(struct inode *inode, } goto out_read_extent_cache; update_age_extent_cache: - if (!tei->last_blocks) + if (tei->last_blocks == F2FS_EXTENT_AGE_INVALID) goto out_read_extent_cache; __set_extent_info(&ei, fofs, len, 0, false, @@ -912,7 +912,7 @@ static int __get_new_block_age(struct inode *inode, struct extent_info *ei, cur_age = cur_blocks - tei.last_blocks; else /* allocated_data_blocks overflow */ - cur_age = ULLONG_MAX - tei.last_blocks + cur_blocks; + cur_age = (ULLONG_MAX - 1) - tei.last_blocks + cur_blocks; if (tei.age) ei->age = __calculate_block_age(sbi, cur_age, tei.age); @@ -1114,6 +1114,7 @@ void f2fs_update_age_extent_cache_range(struct dnode_of_data *dn, struct extent_info ei = { .fofs = fofs, .len = len, + .last_blocks = F2FS_EXTENT_AGE_INVALID, }; if (!__may_extent_tree(dn->inode, EX_BLOCK_AGE)) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c85384e8cf42..d9b2777f09ed 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -707,6 +707,12 @@ enum extent_type { NR_EXTENT_CACHES, }; +/* + * Reserved value to mark invalid age extents, hence valid block range + * from 0 to ULLONG_MAX-1 + */ +#define F2FS_EXTENT_AGE_INVALID ULLONG_MAX + struct extent_info { unsigned int fofs; /* start offset in a file */ unsigned int len; /* length of the extent */ diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b45eace879d7..a473cd1fb37d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3863,8 +3863,13 @@ skip_new_segment: locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr)); locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr)); - if (IS_DATASEG(curseg->seg_type)) - atomic64_inc(&sbi->allocated_data_blocks); + if (IS_DATASEG(curseg->seg_type)) { + unsigned long long new_val; + + new_val = atomic64_inc_return(&sbi->allocated_data_blocks); + if (unlikely(new_val == ULLONG_MAX)) + atomic64_set(&sbi->allocated_data_blocks, 0); + } up_write(&sit_i->sentry_lock); From 2e2e0d679a1fb88a960049496373f415b67f274f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 28 Oct 2025 19:50:11 +0000 Subject: [PATCH 24/39] f2fs: add fadvise tracepoint This adds a tracepoint in the fadvise call path. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ include/trace/events/f2fs.h | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7b966f6d40d2..d7047ca6b98d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -5288,6 +5288,8 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, struct inode *inode = file_inode(filp); int err; + trace_f2fs_fadvise(inode, offset, len, advice); + if (advice == POSIX_FADV_SEQUENTIAL) { if (S_ISFIFO(inode->i_mode)) return -ESPIPE; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e1fae78d64a5..e00611ead024 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -586,6 +586,38 @@ TRACE_EVENT(f2fs_file_write_iter, __entry->ret) ); +TRACE_EVENT(f2fs_fadvise, + + TP_PROTO(struct inode *inode, loff_t offset, loff_t len, int advice), + + TP_ARGS(inode, offset, len, advice), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(ino_t, ino) + __field(loff_t, size) + __field(loff_t, offset) + __field(loff_t, len) + __field(int, advice) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->size = i_size_read(inode); + __entry->offset = offset; + __entry->len = len; + __entry->advice = advice; + ), + + TP_printk("dev = (%d,%d), ino = %lu, i_size = %lld offset:%llu, len:%llu, advise:%d", + show_dev_ino(__entry), + (unsigned long long)__entry->size, + __entry->offset, + __entry->len, + __entry->advice) +); + TRACE_EVENT(f2fs_map_blocks, TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int flag, int ret), From 01fba45deaddcce0d0b01c411435d1acf6feab7b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Nov 2025 14:50:22 +0800 Subject: [PATCH 25/39] f2fs: fix return value of f2fs_recover_fsync_data() With below scripts, it will trigger panic in f2fs: mkfs.f2fs -f /dev/vdd mount /dev/vdd /mnt/f2fs touch /mnt/f2fs/foo sync echo 111 >> /mnt/f2fs/foo f2fs_io fsync /mnt/f2fs/foo f2fs_io shutdown 2 /mnt/f2fs umount /mnt/f2fs mount -o ro,norecovery /dev/vdd /mnt/f2fs or mount -o ro,disable_roll_forward /dev/vdd /mnt/f2fs F2FS-fs (vdd): f2fs_recover_fsync_data: recovery fsync data, check_only: 0 F2FS-fs (vdd): Mounted with checkpoint version = 7f5c361f F2FS-fs (vdd): Stopped filesystem due to reason: 0 F2FS-fs (vdd): f2fs_recover_fsync_data: recovery fsync data, check_only: 1 Filesystem f2fs get_tree() didn't set fc->root, returned 1 ------------[ cut here ]------------ kernel BUG at fs/super.c:1761! Oops: invalid opcode: 0000 [#1] SMP PTI CPU: 3 UID: 0 PID: 722 Comm: mount Not tainted 6.18.0-rc2+ #721 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 RIP: 0010:vfs_get_tree.cold+0x18/0x1a Call Trace: fc_mount+0x13/0xa0 path_mount+0x34e/0xc50 __x64_sys_mount+0x121/0x150 do_syscall_64+0x84/0x800 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fa6cc126cfe The root cause is we missed to handle error number returned from f2fs_recover_fsync_data() when mounting image w/ ro,norecovery or ro,disable_roll_forward mount option, result in returning a positive error number to vfs_get_tree(), fix it. Cc: stable@kernel.org Fixes: 6781eabba1bd ("f2fs: give -EINVAL for norecovery and rw mount") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0b0ef8ba243b..8cf98c40b160 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -5211,11 +5211,15 @@ try_onemore: } } else { err = f2fs_recover_fsync_data(sbi, true); - - if (!f2fs_readonly(sb) && err > 0) { - err = -EINVAL; - f2fs_err(sbi, "Need to recover fsync data"); - goto free_meta; + if (err > 0) { + if (!f2fs_readonly(sb)) { + f2fs_err(sbi, "Need to recover fsync data"); + err = -EINVAL; + goto free_meta; + } else { + f2fs_info(sbi, "drop all fsynced data"); + err = 0; + } } } From 68d05693f8c031257a0822464366e1c2a239a512 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 5 Nov 2025 14:50:23 +0800 Subject: [PATCH 26/39] f2fs: fix to detect recoverable inode during dryrun of find_fsync_dnodes() mkfs.f2fs -f /dev/vdd mount /dev/vdd /mnt/f2fs touch /mnt/f2fs/foo sync # avoid CP_UMOUNT_FLAG in last f2fs_checkpoint.ckpt_flags touch /mnt/f2fs/bar f2fs_io fsync /mnt/f2fs/bar f2fs_io shutdown 2 /mnt/f2fs umount /mnt/f2fs blockdev --setro /dev/vdd mount /dev/vdd /mnt/f2fs mount: /mnt/f2fs: WARNING: source write-protected, mounted read-only. For the case if we create and fsync a new inode before sudden power-cut, without norecovery or disable_roll_forward mount option, the following mount will succeed w/o recovering last fsynced inode. The problem here is that we only check inode_list list after find_fsync_dnodes() in f2fs_recover_fsync_data() to find out whether there is recoverable data in the iamge, but there is a missed case, if last fsynced inode is not existing in last checkpoint, then, we will fail to get its inode due to nat of inode node is not existing in last checkpoint, so the inode won't be linked in inode_list. Let's detect such case in dyrun mode to fix this issue. After this change, mount will fail as expected below: mount: /mnt/f2fs: cannot mount /dev/vdd read-only. dmesg(1) may have more information after failed mount system call. demsg: F2FS-fs (vdd): Need to recover fsync data, but write access unavailable, please try mount w/ disable_roll_forward or norecovery Cc: stable@kernel.org Fixes: 6781eabba1bd ("f2fs: give -EINVAL for norecovery and rw mount") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 215e442db72c..d7faebaa3c6b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -399,7 +399,7 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, - bool check_only) + bool check_only, bool *new_inode) { struct curseg_info *curseg; block_t blkaddr, blkaddr_fast; @@ -447,16 +447,19 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, quota_inode = true; } - /* - * CP | dnode(F) | inode(DF) - * For this case, we should not give up now. - */ entry = add_fsync_inode(sbi, head, ino_of_node(folio), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); - if (err == -ENOENT) + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ + if (err == -ENOENT) { + if (check_only) + *new_inode = true; goto next; + } f2fs_folio_put(folio, true); break; } @@ -875,6 +878,7 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) int ret = 0; unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + bool new_inode = false; f2fs_notice(sbi, "f2fs_recover_fsync_data: recovery fsync data, " "check_only: %d", check_only); @@ -890,8 +894,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) f2fs_down_write(&sbi->cp_global_sem); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list, check_only); - if (err || list_empty(&inode_list)) + err = find_fsync_dnodes(sbi, &inode_list, check_only, &new_inode); + if (err < 0 || (list_empty(&inode_list) && (!check_only || !new_inode))) goto skip; if (check_only) { From 7ee8bc3942f20964ad730871b885688ea3a2961a Mon Sep 17 00:00:00 2001 From: Daeho Jeong Date: Tue, 11 Nov 2025 09:52:46 -0800 Subject: [PATCH 27/39] f2fs: revert summary entry count from 2048 to 512 in 16kb block support The recent increase in the number of Segment Summary Area (SSA) entries from 512 to 2048 was an unintentional change in logic of 16kb block support. This commit corrects the issue. To better utilize the space available from the erroneous 2048-entry calculation, we are implementing a solution to share the currently unused SSA space with neighboring segments. This enhances overall SSA utilization without impacting the established 8MB segment size. Fixes: d7e9a9037de2 ("f2fs: Support Block Size == Page Size") Signed-off-by: Daeho Jeong Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 + fs/f2fs/gc.c | 115 ++++++++++++++++++++++++---------------- fs/f2fs/recovery.c | 2 +- fs/f2fs/segment.c | 38 +++++++++---- fs/f2fs/segment.h | 8 ++- fs/f2fs/super.c | 14 +++++ fs/f2fs/sysfs.c | 7 +++ include/linux/f2fs_fs.h | 5 +- 8 files changed, 129 insertions(+), 62 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d9b2777f09ed..860e9c69d3a6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -245,6 +245,7 @@ struct f2fs_mount_info { #define F2FS_FEATURE_COMPRESSION 0x00002000 #define F2FS_FEATURE_RO 0x00004000 #define F2FS_FEATURE_DEVICE_ALIAS 0x00008000 +#define F2FS_FEATURE_PACKED_SSA 0x00010000 #define __F2FS_HAS_FEATURE(raw_super, mask) \ ((raw_super->feature & cpu_to_le32(mask)) != 0) @@ -4704,6 +4705,7 @@ F2FS_FEATURE_FUNCS(casefold, CASEFOLD); F2FS_FEATURE_FUNCS(compression, COMPRESSION); F2FS_FEATURE_FUNCS(readonly, RO); F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); +F2FS_FEATURE_FUNCS(packed_ssa, PACKED_SSA); #ifdef CONFIG_BLK_DEV_ZONED static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index e04aafee1f2c..384fa7e2085b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1735,7 +1735,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; unsigned char data_type = (type == SUM_TYPE_DATA) ? DATA : NODE; - int submitted = 0; + int submitted = 0, sum_blk_cnt; if (__is_large_section(sbi)) { sec_end_segno = rounddown(end_segno, SEGS_PER_SEC(sbi)); @@ -1769,22 +1769,28 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sanity_check_seg_type(sbi, get_seg_entry(sbi, segno)->type); + segno = rounddown(segno, SUMS_PER_BLOCK); + sum_blk_cnt = DIV_ROUND_UP(end_segno - segno, SUMS_PER_BLOCK); /* readahead multi ssa blocks those have contiguous address */ if (__is_large_section(sbi)) f2fs_ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), - end_segno - segno, META_SSA, true); + sum_blk_cnt, META_SSA, true); /* reference all summary page */ while (segno < end_segno) { - struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno++); + struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno); + + segno += SUMS_PER_BLOCK; if (IS_ERR(sum_folio)) { int err = PTR_ERR(sum_folio); - end_segno = segno - 1; - for (segno = start_segno; segno < end_segno; segno++) { + end_segno = segno - SUMS_PER_BLOCK; + segno = rounddown(start_segno, SUMS_PER_BLOCK); + while (segno < end_segno) { sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); folio_put_refs(sum_folio, 2); + segno += SUMS_PER_BLOCK; } return err; } @@ -1793,68 +1799,83 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, blk_start_plug(&plug); - for (segno = start_segno; segno < end_segno; segno++) { - struct f2fs_summary_block *sum; + segno = start_segno; + while (segno < end_segno) { + unsigned int cur_segno; /* find segment summary of victim */ struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); + unsigned int block_end_segno = rounddown(segno, SUMS_PER_BLOCK) + + SUMS_PER_BLOCK; + + if (block_end_segno > end_segno) + block_end_segno = end_segno; if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { f2fs_err(sbi, "%s: segment %u is used by log", __func__, segno); f2fs_bug_on(sbi, 1); - goto skip; + goto next_block; } - if (get_valid_blocks(sbi, segno, false) == 0) - goto freed; - if (gc_type == BG_GC && __is_large_section(sbi) && - migrated >= sbi->migration_granularity) - goto skip; if (!folio_test_uptodate(sum_folio) || unlikely(f2fs_cp_error(sbi))) - goto skip; + goto next_block; - sum = folio_address(sum_folio); - if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", - segno, type, GET_SUM_TYPE((&sum->footer))); - f2fs_stop_checkpoint(sbi, false, - STOP_CP_REASON_CORRUPTED_SUMMARY); - goto skip; - } + for (cur_segno = segno; cur_segno < block_end_segno; + cur_segno++) { + struct f2fs_summary_block *sum; - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - down_write(sentry_lock) - * - down_read(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - if (type == SUM_TYPE_NODE) - submitted += gc_node_segment(sbi, sum->entries, segno, - gc_type); - else - submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type, - force_migrate); + if (get_valid_blocks(sbi, cur_segno, false) == 0) + goto freed; + if (gc_type == BG_GC && __is_large_section(sbi) && + migrated >= sbi->migration_granularity) + continue; - stat_inc_gc_seg_count(sbi, data_type, gc_type); - sbi->gc_reclaimed_segs[sbi->gc_mode]++; - migrated++; + sum = SUM_BLK_PAGE_ADDR(sum_folio, cur_segno); + if (type != GET_SUM_TYPE((&sum->footer))) { + f2fs_err(sbi, "Inconsistent segment (%u) type " + "[%d, %d] in SSA and SIT", + cur_segno, type, + GET_SUM_TYPE((&sum->footer))); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_SUMMARY); + continue; + } + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - down_write(sentry_lock) + * - down_read(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + if (type == SUM_TYPE_NODE) + submitted += gc_node_segment(sbi, sum->entries, + cur_segno, gc_type); + else + submitted += gc_data_segment(sbi, sum->entries, + gc_list, cur_segno, + gc_type, force_migrate); + + stat_inc_gc_seg_count(sbi, data_type, gc_type); + sbi->gc_reclaimed_segs[sbi->gc_mode]++; + migrated++; freed: - if (gc_type == FG_GC && - get_valid_blocks(sbi, segno, false) == 0) - seg_freed++; + if (gc_type == FG_GC && + get_valid_blocks(sbi, cur_segno, false) == 0) + seg_freed++; - if (__is_large_section(sbi)) - sbi->next_victim_seg[gc_type] = - (segno + 1 < sec_end_segno) ? - segno + 1 : NULL_SEGNO; -skip: + if (__is_large_section(sbi)) + sbi->next_victim_seg[gc_type] = + (cur_segno + 1 < sec_end_segno) ? + cur_segno + 1 : NULL_SEGNO; + } +next_block: folio_put_refs(sum_folio, 2); + segno = block_end_segno; } if (submitted) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index d7faebaa3c6b..62a0c71b5b75 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -522,7 +522,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, sum_folio = f2fs_get_sum_folio(sbi, segno); if (IS_ERR(sum_folio)) return PTR_ERR(sum_folio); - sum_node = folio_address(sum_folio); + sum_node = SUM_BLK_PAGE_ADDR(sum_folio, segno); sum = sum_node->entries[blkoff]; f2fs_folio_put(sum_folio, true); got_it: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a473cd1fb37d..10d873d1b328 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2712,7 +2712,15 @@ struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno) void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { - struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); + struct folio *folio; + + if (SUMS_PER_BLOCK == 1) + folio = f2fs_grab_meta_folio(sbi, blk_addr); + else + folio = f2fs_get_meta_folio_retry(sbi, blk_addr); + + if (IS_ERR(folio)) + return; memcpy(folio_address(folio), src, PAGE_SIZE); folio_mark_dirty(folio); @@ -2720,9 +2728,21 @@ void f2fs_update_meta_page(struct f2fs_sb_info *sbi, } static void write_sum_page(struct f2fs_sb_info *sbi, - struct f2fs_summary_block *sum_blk, block_t blk_addr) + struct f2fs_summary_block *sum_blk, unsigned int segno) { - f2fs_update_meta_page(sbi, (void *)sum_blk, blk_addr); + struct folio *folio; + + if (SUMS_PER_BLOCK == 1) + return f2fs_update_meta_page(sbi, (void *)sum_blk, + GET_SUM_BLOCK(sbi, segno)); + + folio = f2fs_get_sum_folio(sbi, segno); + if (IS_ERR(folio)) + return; + + memcpy(SUM_BLK_PAGE_ADDR(folio, segno), sum_blk, sizeof(*sum_blk)); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } static void write_current_sum_page(struct f2fs_sb_info *sbi, @@ -2987,7 +3007,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) int ret; if (curseg->inited) - write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, segno)); + write_sum_page(sbi, curseg->sum_blk, segno); segno = __get_next_segno(sbi, type); ret = get_new_segment(sbi, &segno, new_sec, pinning); @@ -3046,7 +3066,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) struct folio *sum_folio; if (curseg->inited) - write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, curseg->segno); __set_test_and_inuse(sbi, new_segno); @@ -3065,7 +3085,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); return PTR_ERR(sum_folio); } - sum_node = folio_address(sum_folio); + sum_node = SUM_BLK_PAGE_ADDR(sum_folio, new_segno); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); f2fs_folio_put(sum_folio, true); return 0; @@ -3154,8 +3174,7 @@ static void __f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi, int type) goto out; if (get_valid_blocks(sbi, curseg->segno, false)) { - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, curseg->segno); } else { mutex_lock(&DIRTY_I(sbi)->seglist_lock); __set_test_and_free(sbi, curseg->segno, true); @@ -3833,8 +3852,7 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct folio *folio, if (segment_full) { if (type == CURSEG_COLD_DATA_PINNED && !((curseg->segno + 1) % sbi->segs_per_sec)) { - write_sum_page(sbi, curseg->sum_blk, - GET_SUM_BLOCK(sbi, curseg->segno)); + write_sum_page(sbi, curseg->sum_blk, curseg->segno); reset_curseg_fields(curseg); goto skip_new_segment; } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 1ce2c8abaf48..e883f14c228f 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -85,8 +85,12 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define GET_ZONE_FROM_SEG(sbi, segno) \ GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) -#define GET_SUM_BLOCK(sbi, segno) \ - ((sbi)->sm_info->ssa_blkaddr + (segno)) +#define SUMS_PER_BLOCK (F2FS_BLKSIZE / F2FS_SUM_BLKSIZE) +#define GET_SUM_BLOCK(sbi, segno) \ + (SM_I(sbi)->ssa_blkaddr + (segno / SUMS_PER_BLOCK)) +#define GET_SUM_BLKOFF(segno) (segno % SUMS_PER_BLOCK) +#define SUM_BLK_PAGE_ADDR(folio, segno) \ + (folio_address(folio) + GET_SUM_BLKOFF(segno) * F2FS_SUM_BLKSIZE) #define GET_SUM_TYPE(footer) ((footer)->entry_type) #define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 8cf98c40b160..c2161b3469b3 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -4080,6 +4080,20 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, if (sanity_check_area_boundary(sbi, folio, index)) return -EFSCORRUPTED; + /* + * Check for legacy summary layout on 16KB+ block devices. + * Modern f2fs-tools packs multiple 4KB summary areas into one block, + * whereas legacy versions used one block per summary, leading + * to a much larger SSA. + */ + if (SUMS_PER_BLOCK > 1 && + !(__F2FS_HAS_FEATURE(raw_super, F2FS_FEATURE_PACKED_SSA))) { + f2fs_info(sbi, "Error: Device formatted with a legacy version. " + "Please reformat with a tool supporting the packed ssa " + "feature for block sizes larger than 4kb."); + return -EOPNOTSUPP; + } + return 0; } diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 6d2a4fba68a2..5685b454bfd1 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -235,6 +235,9 @@ static ssize_t features_show(struct f2fs_attr *a, if (f2fs_sb_has_compression(sbi)) len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "compression"); + if (f2fs_sb_has_packed_ssa(sbi)) + len += sysfs_emit_at(buf, len, "%s%s", + len ? ", " : "", "packed_ssa"); len += sysfs_emit_at(buf, len, "%s%s", len ? ", " : "", "pin_file"); len += sysfs_emit_at(buf, len, "\n"); @@ -1296,6 +1299,7 @@ F2FS_FEATURE_RO_ATTR(pin_file); #ifdef CONFIG_UNICODE F2FS_FEATURE_RO_ATTR(linear_lookup); #endif +F2FS_FEATURE_RO_ATTR(packed_ssa); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -1455,6 +1459,7 @@ static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_UNICODE BASE_ATTR_LIST(linear_lookup), #endif + BASE_ATTR_LIST(packed_ssa), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1490,6 +1495,7 @@ F2FS_SB_FEATURE_RO_ATTR(casefold, CASEFOLD); F2FS_SB_FEATURE_RO_ATTR(compression, COMPRESSION); F2FS_SB_FEATURE_RO_ATTR(readonly, RO); F2FS_SB_FEATURE_RO_ATTR(device_alias, DEVICE_ALIAS); +F2FS_SB_FEATURE_RO_ATTR(packed_ssa, PACKED_SSA); static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_encryption), @@ -1507,6 +1513,7 @@ static struct attribute *f2fs_sb_feat_attrs[] = { ATTR_LIST(sb_compression), ATTR_LIST(sb_readonly), ATTR_LIST(sb_device_alias), + ATTR_LIST(sb_packed_ssa), NULL, }; ATTRIBUTE_GROUPS(f2fs_sb_feat); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 6afb4a13b81d..a7880787cad3 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -17,6 +17,7 @@ #define F2FS_LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) /* log number for sector/blk */ #define F2FS_BLKSIZE PAGE_SIZE /* support only block == page */ #define F2FS_BLKSIZE_BITS PAGE_SHIFT /* bits for F2FS_BLKSIZE */ +#define F2FS_SUM_BLKSIZE 4096 /* only support 4096 byte sum block */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ #define F2FS_EXTENSION_LEN 8 /* max size of extension */ @@ -441,7 +442,7 @@ struct f2fs_sit_block { * from node's page's beginning to get a data block address. * ex) data_blkaddr = (block_t)(nodepage_start_address + ofs_in_node) */ -#define ENTRIES_IN_SUM (F2FS_BLKSIZE / 8) +#define ENTRIES_IN_SUM (F2FS_SUM_BLKSIZE / 8) #define SUMMARY_SIZE (7) /* sizeof(struct f2fs_summary) */ #define SUM_FOOTER_SIZE (5) /* sizeof(struct summary_footer) */ #define SUM_ENTRY_SIZE (SUMMARY_SIZE * ENTRIES_IN_SUM) @@ -467,7 +468,7 @@ struct summary_footer { __le32 check_sum; /* summary checksum */ } __packed; -#define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ +#define SUM_JOURNAL_SIZE (F2FS_SUM_BLKSIZE - SUM_FOOTER_SIZE -\ SUM_ENTRY_SIZE) #define NAT_JOURNAL_ENTRIES ((SUM_JOURNAL_SIZE - 2) /\ sizeof(struct nat_journal_entry)) From 24fd7f00161055e1ca0dd137a1d67f87fa781f99 Mon Sep 17 00:00:00 2001 From: Baolin Liu Date: Tue, 11 Nov 2025 20:17:28 +0800 Subject: [PATCH 28/39] f2fs: simplify list initialization in f2fs_recover_fsync_data() In f2fs_recover_fsync_data(),use LIST_HEAD() to declare and initialize the list_head in one step instead of using INIT_LIST_HEAD() separately. No functional change. Signed-off-by: Baolin Liu Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 62a0c71b5b75..c3415ebb9f50 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -872,8 +872,9 @@ next: int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct list_head inode_list, tmp_inode_list; - struct list_head dir_list; + LIST_HEAD(inode_list); + LIST_HEAD(tmp_inode_list); + LIST_HEAD(dir_list); int err; int ret = 0; unsigned long s_flags = sbi->sb->s_flags; @@ -886,10 +887,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) if (is_sbi_flag_set(sbi, SBI_IS_WRITABLE)) f2fs_info(sbi, "recover fsync data on readonly fs"); - INIT_LIST_HEAD(&inode_list); - INIT_LIST_HEAD(&tmp_inode_list); - INIT_LIST_HEAD(&dir_list); - /* prevent checkpoint */ f2fs_down_write(&sbi->cp_global_sem); From 581251e03077f2fb83f9d10f5e21ec7e546a82b4 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Mon, 10 Nov 2025 16:22:21 +0800 Subject: [PATCH 29/39] f2fs: wrap all unusable_blocks_per_sec code in CONFIG_BLK_DEV_ZONED The usage of unusable_blocks_per_sec is already wrapped by CONFIG_BLK_DEV_ZONED, except for its declaration and the definitions of CAP_BLKS_PER_SEC and CAP_SEGS_PER_SEC. This patch ensures that all code related to unusable_blocks_per_sec is properly wrapped under the CONFIG_BLK_DEV_ZONED option. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 860e9c69d3a6..3d8b69628721 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1669,6 +1669,7 @@ struct f2fs_sb_info { #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int max_open_zones; /* max open zone resources of the zoned device */ /* For adjust the priority writing position of data in zone UFS */ unsigned int blkzone_alloc_policy; @@ -1741,7 +1742,6 @@ struct f2fs_sb_info { unsigned int meta_ino_num; /* meta inode number*/ unsigned int log_blocks_per_seg; /* log2 blocks per segment */ unsigned int blocks_per_seg; /* blocks per segment */ - unsigned int unusable_blocks_per_sec; /* unusable blocks per section */ unsigned int segs_per_sec; /* segments per section */ unsigned int secs_per_zone; /* sections per zone */ unsigned int total_sections; /* total section count */ diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e883f14c228f..0b54d87409b0 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -69,11 +69,16 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, ((!__is_valid_data_blkaddr(blk_addr)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) +#ifdef CONFIG_BLK_DEV_ZONED #define CAP_BLKS_PER_SEC(sbi) \ (BLKS_PER_SEC(sbi) - (sbi)->unusable_blocks_per_sec) #define CAP_SEGS_PER_SEC(sbi) \ (SEGS_PER_SEC(sbi) - \ BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec)) +#else +#define CAP_BLKS_PER_SEC(sbi) BLKS_PER_SEC(sbi) +#define CAP_SEGS_PER_SEC(sbi) SEGS_PER_SEC(sbi) +#endif #define GET_START_SEG_FROM_SEC(sbi, segno) \ (rounddown(segno, SEGS_PER_SEC(sbi))) #define GET_SEC_FROM_SEG(sbi, segno) \ From 89732017890e8ce0826e18c743b71dc564d3a674 Mon Sep 17 00:00:00 2001 From: Yongpeng Yang Date: Mon, 10 Nov 2025 16:22:22 +0800 Subject: [PATCH 30/39] f2fs: add a sysfs entry to show max open zones This patch adds a sysfs entry showing the max zones that F2FS can write concurrently. Signed-off-by: Yongpeng Yang Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 6 ++++++ fs/f2fs/sysfs.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index b590809869ca..770470e0598b 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -643,6 +643,12 @@ Contact: "Jaegeuk Kim" Description: Shows the number of unusable blocks in a section which was defined by the zone capacity reported by underlying zoned device. +What: /sys/fs/f2fs//max_open_zones +Date: November 2025 +Contact: "Yongpeng Yang" +Description: Shows the max number of zones that F2FS can write concurrently when a zoned + device is mounted. + What: /sys/fs/f2fs//current_atomic_write Date: July 2022 Contact: "Daeho Jeong" diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index 5685b454bfd1..c42f4f979d13 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -1213,6 +1213,7 @@ F2FS_SBI_GENERAL_RW_ATTR(last_age_weight); F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); #ifdef CONFIG_BLK_DEV_ZONED F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); +F2FS_SBI_GENERAL_RO_ATTR(max_open_zones); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); @@ -1388,6 +1389,7 @@ static struct attribute *f2fs_attrs[] = { #endif #ifdef CONFIG_BLK_DEV_ZONED ATTR_LIST(unusable_blocks_per_sec), + ATTR_LIST(max_open_zones), ATTR_LIST(blkzone_alloc_policy), #endif #ifdef CONFIG_F2FS_FS_COMPRESSION From 30a8496694f1a93328e5d7f19206380346918b5a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Nov 2025 09:47:47 +0800 Subject: [PATCH 31/39] f2fs: use memalloc_retry_wait() as much as possible memalloc_retry_wait() is recommended in memory allocation retry logic, use it as much as possible. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 2 +- fs/f2fs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 10d873d1b328..d968a4250b1a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -234,7 +234,7 @@ retry: err = f2fs_get_dnode_of_data(&dn, index, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index c2161b3469b3..2bd7c2320d4f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -3139,7 +3139,7 @@ retry: &folio, &fsdata); if (unlikely(err)) { if (err == -ENOMEM) { - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); From 76e780d88c771921ea643fb8a6c8d0b08c17cb7b Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Nov 2025 09:47:48 +0800 Subject: [PATCH 32/39] f2fs: introduce f2fs_schedule_timeout() In f2fs retry logic, we will call f2fs_io_schedule_timeout() to sleep as uninterruptible state (waiting for IO) for a while, however, in several paths below, we are not blocked by IO: - f2fs_write_single_data_page() return -EAGAIN due to racing on cp_rwsem. - f2fs_flush_device_cache() failed to submit preflush command. - __issue_discard_cmd_range() sleeps periodically in between two in batch discard submissions. So, in order to reveal state of task more accurate, let's introduce f2fs_schedule_timeout() and call it in above paths in where we are waiting for non-IO reasons. Then we can get real reason of uninterruptible sleep for a thread in tracepoint, perfetto, etc. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/compress.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/f2fs.h | 22 +++++++++++++++------- fs/f2fs/segment.c | 4 ++-- fs/f2fs/super.c | 2 +- 6 files changed, 24 insertions(+), 16 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index bbe07e3a6c75..4c401b5b2933 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1318,7 +1318,7 @@ void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type) f2fs_submit_merged_write(sbi, DATA); prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - io_schedule_timeout(DEFAULT_IO_TIMEOUT); + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); } finish_wait(&sbi->cp_wait, &wait); } @@ -1974,7 +1974,7 @@ void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi) /* Let's wait for the previous dispatched checkpoint. */ while (atomic_read(&cprc->queued_ckpt)) - io_schedule_timeout(DEFAULT_IO_TIMEOUT); + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); } void f2fs_init_ckpt_req_control(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 716004ba44dc..148bb925b03b 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1057,7 +1057,7 @@ static void cancel_cluster_writeback(struct compress_ctx *cc, f2fs_submit_merged_write(F2FS_I_SB(cc->inode), DATA); while (atomic_read(&cic->pending_pages) != (cc->valid_nr_cpages - submitted + 1)) - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); } /* Cancel writeback and stay locked. */ @@ -1574,7 +1574,7 @@ continue_unlock: */ if (IS_NOQUOTA(cc->inode)) goto out; - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); goto retry_write; } goto out; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7be0837a9456..9d4f46b8c256 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -3141,8 +3141,8 @@ result: } else if (ret == -EAGAIN) { ret = 0; if (wbc->sync_mode == WB_SYNC_ALL) { - f2fs_io_schedule_timeout( - DEFAULT_IO_TIMEOUT); + f2fs_schedule_timeout( + DEFAULT_SCHEDULE_TIMEOUT); goto retry_write; } goto next; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3d8b69628721..858ceb3d2ad6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -656,8 +656,8 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ -/* congestion wait timeout value, default: 20ms */ -#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) +/* IO/non-IO congestion wait timeout value, default: 20ms */ +#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(20)) /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) @@ -4908,22 +4908,30 @@ static inline bool f2fs_block_unit_discard(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_BLOCK; } -static inline void f2fs_io_schedule_timeout(long timeout) +static inline void __f2fs_schedule_timeout(long timeout, bool io) { set_current_state(TASK_UNINTERRUPTIBLE); - io_schedule_timeout(timeout); + if (io) + io_schedule_timeout(timeout); + else + schedule_timeout(timeout); } +#define f2fs_io_schedule_timeout(timeout) \ + __f2fs_schedule_timeout(timeout, true) +#define f2fs_schedule_timeout(timeout) \ + __f2fs_schedule_timeout(timeout, false) + static inline void f2fs_io_schedule_timeout_killable(long timeout) { while (timeout) { if (fatal_signal_pending(current)) return; set_current_state(TASK_UNINTERRUPTIBLE); - io_schedule_timeout(DEFAULT_IO_TIMEOUT); - if (timeout <= DEFAULT_IO_TIMEOUT) + io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + if (timeout <= DEFAULT_SCHEDULE_TIMEOUT) return; - timeout -= DEFAULT_IO_TIMEOUT; + timeout -= DEFAULT_SCHEDULE_TIMEOUT; } } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index d968a4250b1a..993ec8afe2db 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -750,7 +750,7 @@ int f2fs_flush_device_cache(struct f2fs_sb_info *sbi) do { ret = __submit_flush_wait(sbi, FDEV(i).bdev); if (ret) - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); } while (ret && --count); if (ret) { @@ -3471,7 +3471,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); goto next; } skip: diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2bd7c2320d4f..d47ec718f3be 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -2652,7 +2652,7 @@ static int f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* we should flush all the data to keep data consistency */ while (get_pages(sbi, F2FS_DIRTY_DATA)) { writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); - f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); + f2fs_io_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); if (f2fs_time_over(sbi, ENABLE_TIME)) break; From d31e0de8b8625874d2fc4f5506b3bf30610555a0 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Nov 2025 09:47:49 +0800 Subject: [PATCH 33/39] f2fs: change default schedule timeout value This patch changes default schedule timeout value from 20ms to 1ms, in order to give caller more chances to check whether IO or non-IO congestion condition has already been mitigable. In addition, default interval of periodical discard submission is kept to 20ms. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 6 ++++-- fs/f2fs/segment.c | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 858ceb3d2ad6..842dd1963b57 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -407,6 +407,8 @@ struct discard_entry { #define DEFAULT_DISCARD_GRANULARITY 16 /* default maximum discard granularity of ordered discard, unit: block count */ #define DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY 16 +/* default interval of periodical discard submission */ +#define DEFAULT_DISCARD_INTERVAL (msecs_to_jiffies(20)) /* max discard pend list number */ #define MAX_PLIST_NUM 512 @@ -656,8 +658,8 @@ enum { #define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO or flush count */ -/* IO/non-IO congestion wait timeout value, default: 20ms */ -#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(20)) +/* IO/non-IO congestion wait timeout value, default: 1ms */ +#define DEFAULT_SCHEDULE_TIMEOUT (msecs_to_jiffies(1)) /* timeout value injected, default: 1000ms */ #define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 993ec8afe2db..8375dca7ed9e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -3471,7 +3471,7 @@ next: blk_finish_plug(&plug); mutex_unlock(&dcc->cmd_lock); trimmed += __wait_all_discard_cmd(sbi, NULL); - f2fs_schedule_timeout(DEFAULT_SCHEDULE_TIMEOUT); + f2fs_schedule_timeout(DEFAULT_DISCARD_INTERVAL); goto next; } skip: From 1627a303bca692edc6552630aa2f878c8a726a01 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 17 Nov 2025 20:45:59 +0800 Subject: [PATCH 34/39] f2fs: expand scalability of f2fs mount option opt field in structure f2fs_mount_info and opt_mask field in structure f2fs_fs_context is 32-bits variable, now we're running out of available bits in them, let's expand them to 64-bits for better scalability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 85 ++++++++++++++++++++++++++----------------------- fs/f2fs/super.c | 36 ++++++++++----------- 2 files changed, 63 insertions(+), 58 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 842dd1963b57..55d29d50159e 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -96,47 +96,52 @@ extern const char *f2fs_fault_name[FAULT_MAX]; /* * For mount options */ -#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000001 -#define F2FS_MOUNT_DISCARD 0x00000002 -#define F2FS_MOUNT_NOHEAP 0x00000004 -#define F2FS_MOUNT_XATTR_USER 0x00000008 -#define F2FS_MOUNT_POSIX_ACL 0x00000010 -#define F2FS_MOUNT_DISABLE_EXT_IDENTIFY 0x00000020 -#define F2FS_MOUNT_INLINE_XATTR 0x00000040 -#define F2FS_MOUNT_INLINE_DATA 0x00000080 -#define F2FS_MOUNT_INLINE_DENTRY 0x00000100 -#define F2FS_MOUNT_FLUSH_MERGE 0x00000200 -#define F2FS_MOUNT_NOBARRIER 0x00000400 -#define F2FS_MOUNT_FASTBOOT 0x00000800 -#define F2FS_MOUNT_READ_EXTENT_CACHE 0x00001000 -#define F2FS_MOUNT_DATA_FLUSH 0x00002000 -#define F2FS_MOUNT_FAULT_INJECTION 0x00004000 -#define F2FS_MOUNT_USRQUOTA 0x00008000 -#define F2FS_MOUNT_GRPQUOTA 0x00010000 -#define F2FS_MOUNT_PRJQUOTA 0x00020000 -#define F2FS_MOUNT_QUOTA 0x00040000 -#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00080000 -#define F2FS_MOUNT_RESERVE_ROOT 0x00100000 -#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x00200000 -#define F2FS_MOUNT_NORECOVERY 0x00400000 -#define F2FS_MOUNT_ATGC 0x00800000 -#define F2FS_MOUNT_MERGE_CHECKPOINT 0x01000000 -#define F2FS_MOUNT_GC_MERGE 0x02000000 -#define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 -#define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 -#define F2FS_MOUNT_NAT_BITS 0x10000000 -#define F2FS_MOUNT_INLINECRYPT 0x20000000 -/* - * Some f2fs environments expect to be able to pass the "lazytime" option - * string rather than using the MS_LAZYTIME flag, so this must remain. - */ -#define F2FS_MOUNT_LAZYTIME 0x40000000 -#define F2FS_MOUNT_RESERVE_NODE 0x80000000 +enum f2fs_mount_opt { + F2FS_MOUNT_DISABLE_ROLL_FORWARD, + F2FS_MOUNT_DISCARD, + F2FS_MOUNT_NOHEAP, + F2FS_MOUNT_XATTR_USER, + F2FS_MOUNT_POSIX_ACL, + F2FS_MOUNT_DISABLE_EXT_IDENTIFY, + F2FS_MOUNT_INLINE_XATTR, + F2FS_MOUNT_INLINE_DATA, + F2FS_MOUNT_INLINE_DENTRY, + F2FS_MOUNT_FLUSH_MERGE, + F2FS_MOUNT_NOBARRIER, + F2FS_MOUNT_FASTBOOT, + F2FS_MOUNT_READ_EXTENT_CACHE, + F2FS_MOUNT_DATA_FLUSH, + F2FS_MOUNT_FAULT_INJECTION, + F2FS_MOUNT_USRQUOTA, + F2FS_MOUNT_GRPQUOTA, + F2FS_MOUNT_PRJQUOTA, + F2FS_MOUNT_QUOTA, + F2FS_MOUNT_INLINE_XATTR_SIZE, + F2FS_MOUNT_RESERVE_ROOT, + F2FS_MOUNT_DISABLE_CHECKPOINT, + F2FS_MOUNT_NORECOVERY, + F2FS_MOUNT_ATGC, + F2FS_MOUNT_MERGE_CHECKPOINT, + F2FS_MOUNT_GC_MERGE, + F2FS_MOUNT_COMPRESS_CACHE, + F2FS_MOUNT_AGE_EXTENT_CACHE, + F2FS_MOUNT_NAT_BITS, + F2FS_MOUNT_INLINECRYPT, + /* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. + */ + F2FS_MOUNT_LAZYTIME, + F2FS_MOUNT_RESERVE_NODE, +}; #define F2FS_OPTION(sbi) ((sbi)->mount_opt) -#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (F2FS_OPTION(sbi).opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (F2FS_OPTION(sbi).opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt &= ~BIT(F2FS_MOUNT_##option)) +#define set_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt |= BIT(F2FS_MOUNT_##option)) +#define test_opt(sbi, option) \ + (F2FS_OPTION(sbi).opt & BIT(F2FS_MOUNT_##option)) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -183,7 +188,7 @@ struct f2fs_rwsem { }; struct f2fs_mount_info { - unsigned int opt; + unsigned long long opt; block_t root_reserved_blocks; /* root reserved blocks */ block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d47ec718f3be..ccb477086444 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -352,7 +352,7 @@ static match_table_t f2fs_checkpoint_tokens = { struct f2fs_fs_context { struct f2fs_mount_info info; - unsigned int opt_mask; /* Bits changed */ + unsigned long long opt_mask; /* Bits changed */ unsigned int spec_mask; unsigned short qname_mask; }; @@ -360,23 +360,23 @@ struct f2fs_fs_context { #define F2FS_CTX_INFO(ctx) ((ctx)->info) static inline void ctx_set_opt(struct f2fs_fs_context *ctx, - unsigned int flag) + enum f2fs_mount_opt flag) { - ctx->info.opt |= flag; - ctx->opt_mask |= flag; + ctx->info.opt |= BIT(flag); + ctx->opt_mask |= BIT(flag); } static inline void ctx_clear_opt(struct f2fs_fs_context *ctx, - unsigned int flag) + enum f2fs_mount_opt flag) { - ctx->info.opt &= ~flag; - ctx->opt_mask |= flag; + ctx->info.opt &= ~BIT(flag); + ctx->opt_mask |= BIT(flag); } static inline bool ctx_test_opt(struct f2fs_fs_context *ctx, - unsigned int flag) + enum f2fs_mount_opt flag) { - return ctx->info.opt & flag; + return ctx->info.opt & BIT(flag); } void f2fs_printk(struct f2fs_sb_info *sbi, bool limit_rate, @@ -1371,7 +1371,7 @@ static int f2fs_check_compression(struct fs_context *fc, ctx_test_opt(ctx, F2FS_MOUNT_COMPRESS_CACHE)) f2fs_info(sbi, "Image doesn't support compression"); clear_compression_spec(ctx); - ctx->opt_mask &= ~F2FS_MOUNT_COMPRESS_CACHE; + ctx->opt_mask &= ~BIT(F2FS_MOUNT_COMPRESS_CACHE); return 0; } if (ctx->spec_mask & F2FS_SPEC_compress_extension) { @@ -1439,42 +1439,42 @@ static int f2fs_check_opt_consistency(struct fs_context *fc, return -EINVAL; if (f2fs_hw_should_discard(sbi) && - (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) && !ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { f2fs_warn(sbi, "discard is required for zoned block devices"); return -EINVAL; } if (!f2fs_hw_support_discard(sbi) && - (ctx->opt_mask & F2FS_MOUNT_DISCARD) && + (ctx->opt_mask & BIT(F2FS_MOUNT_DISCARD)) && ctx_test_opt(ctx, F2FS_MOUNT_DISCARD)) { f2fs_warn(sbi, "device does not support discard"); ctx_clear_opt(ctx, F2FS_MOUNT_DISCARD); - ctx->opt_mask &= ~F2FS_MOUNT_DISCARD; + ctx->opt_mask &= ~BIT(F2FS_MOUNT_DISCARD); } if (f2fs_sb_has_device_alias(sbi) && - (ctx->opt_mask & F2FS_MOUNT_READ_EXTENT_CACHE) && + (ctx->opt_mask & BIT(F2FS_MOUNT_READ_EXTENT_CACHE)) && !ctx_test_opt(ctx, F2FS_MOUNT_READ_EXTENT_CACHE)) { f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } if (test_opt(sbi, RESERVE_ROOT) && - (ctx->opt_mask & F2FS_MOUNT_RESERVE_ROOT) && + (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_ROOT)) && ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_ROOT)) { f2fs_info(sbi, "Preserve previous reserve_root=%u", F2FS_OPTION(sbi).root_reserved_blocks); ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); - ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; + ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_ROOT); } if (test_opt(sbi, RESERVE_NODE) && - (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) && + (ctx->opt_mask & BIT(F2FS_MOUNT_RESERVE_NODE)) && ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { f2fs_info(sbi, "Preserve previous reserve_node=%u", F2FS_OPTION(sbi).root_reserved_nodes); ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); - ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE; + ctx->opt_mask &= ~BIT(F2FS_MOUNT_RESERVE_NODE); } err = f2fs_check_test_dummy_encryption(fc, sb); From fbc0774b6d55722c90a4509ec8089071b9e7aa18 Mon Sep 17 00:00:00 2001 From: Masaharu Noguchi Date: Mon, 17 Nov 2025 21:27:54 +0900 Subject: [PATCH 35/39] docs: f2fs: wrap ASCII tables in literal blocks to fix LaTeX build Sphinx's LaTeX builder fails when converting the nested ASCII tables in f2fs.rst, producing the following error: "Markup is unsupported in LaTeX: longtable does not support nesting a table." Wrap the affected ASCII tables in literal code blocks to force Sphinx to render them verbatim. This prevents nested longtables and fixes the PDF build failure on Sphinx 8.2.x. Acked-by: Bagas Sanjaya Reviewed-by: Akira Yokosawa Signed-off-by: Masaharu Noguchi Acked-by: Jonathan Corbet Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.rst | 129 +++++++++++++++-------------- 1 file changed, 68 insertions(+), 61 deletions(-) diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index a8d02fe5be83..cb90d1ae82d0 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -188,34 +188,36 @@ fault_type=%d Support configuring fault injection type, should be enabled with fault_injection option, fault type value is shown below, it supports single or combined type. - =========================== ========== - Type_Name Type_Value - =========================== ========== - FAULT_KMALLOC 0x00000001 - FAULT_KVMALLOC 0x00000002 - FAULT_PAGE_ALLOC 0x00000004 - FAULT_PAGE_GET 0x00000008 - FAULT_ALLOC_BIO 0x00000010 (obsolete) - FAULT_ALLOC_NID 0x00000020 - FAULT_ORPHAN 0x00000040 - FAULT_BLOCK 0x00000080 - FAULT_DIR_DEPTH 0x00000100 - FAULT_EVICT_INODE 0x00000200 - FAULT_TRUNCATE 0x00000400 - FAULT_READ_IO 0x00000800 - FAULT_CHECKPOINT 0x00001000 - FAULT_DISCARD 0x00002000 - FAULT_WRITE_IO 0x00004000 - FAULT_SLAB_ALLOC 0x00008000 - FAULT_DQUOT_INIT 0x00010000 - FAULT_LOCK_OP 0x00020000 - FAULT_BLKADDR_VALIDITY 0x00040000 - FAULT_BLKADDR_CONSISTENCE 0x00080000 - FAULT_NO_SEGMENT 0x00100000 - FAULT_INCONSISTENT_FOOTER 0x00200000 - FAULT_TIMEOUT 0x00400000 (1000ms) - FAULT_VMALLOC 0x00800000 - =========================== ========== + .. code-block:: none + + =========================== ========== + Type_Name Type_Value + =========================== ========== + FAULT_KMALLOC 0x00000001 + FAULT_KVMALLOC 0x00000002 + FAULT_PAGE_ALLOC 0x00000004 + FAULT_PAGE_GET 0x00000008 + FAULT_ALLOC_BIO 0x00000010 (obsolete) + FAULT_ALLOC_NID 0x00000020 + FAULT_ORPHAN 0x00000040 + FAULT_BLOCK 0x00000080 + FAULT_DIR_DEPTH 0x00000100 + FAULT_EVICT_INODE 0x00000200 + FAULT_TRUNCATE 0x00000400 + FAULT_READ_IO 0x00000800 + FAULT_CHECKPOINT 0x00001000 + FAULT_DISCARD 0x00002000 + FAULT_WRITE_IO 0x00004000 + FAULT_SLAB_ALLOC 0x00008000 + FAULT_DQUOT_INIT 0x00010000 + FAULT_LOCK_OP 0x00020000 + FAULT_BLKADDR_VALIDITY 0x00040000 + FAULT_BLKADDR_CONSISTENCE 0x00080000 + FAULT_NO_SEGMENT 0x00100000 + FAULT_INCONSISTENT_FOOTER 0x00200000 + FAULT_TIMEOUT 0x00400000 (1000ms) + FAULT_VMALLOC 0x00800000 + =========================== ========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. @@ -296,14 +298,15 @@ nocheckpoint_merge Disable checkpoint merge feature. compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo", "lz4", "zstd" and "lzo-rle" algorithm. compress_algorithm=%s:%d Control compress algorithm and its compress level, now, only - "lz4" and "zstd" support compress level config. + "lz4" and "zstd" support compress level config:: + + ========= =========== + algorithm level range + ========= =========== + lz4 3 - 16 + zstd 1 - 22 + ========= =========== - ========= =========== - algorithm level range - ========= =========== - lz4 3 - 16 - zstd 1 - 22 - ========= =========== compress_log_size=%u Support configuring compress cluster size. The size will be 4KB * (1 << %u). The default and minimum sizes are 16KB. compress_extension=%s Support adding specified extension, so that f2fs can enable @@ -368,38 +371,42 @@ errors=%s Specify f2fs behavior on critical errors. This supports modes: the partition in read-only mode. By default it uses "continue" mode. - ====================== =============== =============== ======== - mode continue remount-ro panic - ====================== =============== =============== ======== - access ops normal normal N/A - syscall errors -EIO -EROFS N/A - mount option rw ro N/A - pending dir write keep keep N/A - pending non-dir write drop keep N/A - pending node write drop keep N/A - pending meta write keep keep N/A - ====================== =============== =============== ======== + .. code-block:: none + + ====================== =============== =============== ======== + mode continue remount-ro panic + ====================== =============== =============== ======== + access ops normal normal N/A + syscall errors -EIO -EROFS N/A + mount option rw ro N/A + pending dir write keep keep N/A + pending non-dir write drop keep N/A + pending node write drop keep N/A + pending meta write keep keep N/A + ====================== =============== =============== ======== nat_bits Enable nat_bits feature to enhance full/empty nat blocks access, by default it's disabled. lookup_mode=%s Control the directory lookup behavior for casefolded directories. This option has no effect on directories that do not have the casefold feature enabled. - ================== ======================================== - Value Description - ================== ======================================== - perf (Default) Enforces a hash-only lookup. - The linear search fallback is always - disabled, ignoring the on-disk flag. - compat Enables the linear search fallback for - compatibility with directory entries - created by older kernel that used a - different case-folding algorithm. - This mode ignores the on-disk flag. - auto F2FS determines the mode based on the - on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL` - flag. - ================== ======================================== + .. code-block:: none + + ================== ======================================== + Value Description + ================== ======================================== + perf (Default) Enforces a hash-only lookup. + The linear search fallback is always + disabled, ignoring the on-disk flag. + compat Enables the linear search fallback for + compatibility with directory entries + created by older kernel that used a + different case-folding algorithm. + This mode ignores the on-disk flag. + auto F2FS determines the mode based on the + on-disk `SB_ENC_NO_COMPAT_FALLBACK_FL` + flag. + ================== ======================================== ======================== ============================================================ Debugfs Entries From 8f11fe52fc1fa39ccfaa7c1e256f53e35d2839fa Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Nov 2025 17:25:34 +0800 Subject: [PATCH 36/39] f2fs: support to show curseg.next_blkoff in debugfs cat /sys/kernel/debug/f2fs/status Main area: 17 segs, 17 secs 17 zones TYPE blkoff segno secno zoneno dirty_seg full_seg valid_blk - COLD data: 0 4 4 4 0 0 0 - WARM data: 0 7 7 7 0 0 0 - HOT data: 1 5 5 5 2 0 512 - Dir dnode: 3 0 0 0 1 0 2 - File dnode: 0 1 1 1 0 0 0 - Indir nodes: 0 2 2 2 0 0 0 - Pinned file: 0 -1 -1 -1 - ATGC data: 0 -1 -1 -1 Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 29 +++++++++++++++++++---------- fs/f2fs/f2fs.h | 1 + 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 43a83bbd3bc5..032683835569 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -251,6 +251,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->blkoff[i] = curseg->next_blkoff; si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -508,55 +509,63 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); - seq_printf(s, " TYPE %8s %8s %8s %10s %10s %10s\n", - "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); - seq_printf(s, " - COLD data: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " TYPE %8s %8s %8s %8s %10s %10s %10s\n", + "blkoff", "segno", "secno", "zoneno", "dirty_seg", "full_seg", "valid_blk"); + seq_printf(s, " - COLD data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_COLD_DATA], si->curseg[CURSEG_COLD_DATA], si->cursec[CURSEG_COLD_DATA], si->curzone[CURSEG_COLD_DATA], si->dirty_seg[CURSEG_COLD_DATA], si->full_seg[CURSEG_COLD_DATA], si->valid_blks[CURSEG_COLD_DATA]); - seq_printf(s, " - WARM data: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " - WARM data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_WARM_DATA], si->curseg[CURSEG_WARM_DATA], si->cursec[CURSEG_WARM_DATA], si->curzone[CURSEG_WARM_DATA], si->dirty_seg[CURSEG_WARM_DATA], si->full_seg[CURSEG_WARM_DATA], si->valid_blks[CURSEG_WARM_DATA]); - seq_printf(s, " - HOT data: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " - HOT data: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_HOT_DATA], si->curseg[CURSEG_HOT_DATA], si->cursec[CURSEG_HOT_DATA], si->curzone[CURSEG_HOT_DATA], si->dirty_seg[CURSEG_HOT_DATA], si->full_seg[CURSEG_HOT_DATA], si->valid_blks[CURSEG_HOT_DATA]); - seq_printf(s, " - Dir dnode: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " - Dir dnode: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_HOT_NODE], si->curseg[CURSEG_HOT_NODE], si->cursec[CURSEG_HOT_NODE], si->curzone[CURSEG_HOT_NODE], si->dirty_seg[CURSEG_HOT_NODE], si->full_seg[CURSEG_HOT_NODE], si->valid_blks[CURSEG_HOT_NODE]); - seq_printf(s, " - File dnode: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " - File dnode: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_WARM_NODE], si->curseg[CURSEG_WARM_NODE], si->cursec[CURSEG_WARM_NODE], si->curzone[CURSEG_WARM_NODE], si->dirty_seg[CURSEG_WARM_NODE], si->full_seg[CURSEG_WARM_NODE], si->valid_blks[CURSEG_WARM_NODE]); - seq_printf(s, " - Indir nodes: %8d %8d %8d %10u %10u %10u\n", + seq_printf(s, " - Indir nodes: %8d %8d %8d %8d %10u %10u %10u\n", + si->blkoff[CURSEG_COLD_NODE], si->curseg[CURSEG_COLD_NODE], si->cursec[CURSEG_COLD_NODE], si->curzone[CURSEG_COLD_NODE], si->dirty_seg[CURSEG_COLD_NODE], si->full_seg[CURSEG_COLD_NODE], si->valid_blks[CURSEG_COLD_NODE]); - seq_printf(s, " - Pinned file: %8d %8d %8d\n", + seq_printf(s, " - Pinned file: %8d %8d %8d %8d\n", + si->blkoff[CURSEG_COLD_DATA_PINNED], si->curseg[CURSEG_COLD_DATA_PINNED], si->cursec[CURSEG_COLD_DATA_PINNED], si->curzone[CURSEG_COLD_DATA_PINNED]); - seq_printf(s, " - ATGC data: %8d %8d %8d\n", + seq_printf(s, " - ATGC data: %8d %8d %8d %8d\n", + si->blkoff[CURSEG_ALL_DATA_ATGC], si->curseg[CURSEG_ALL_DATA_ATGC], si->cursec[CURSEG_ALL_DATA_ATGC], si->curzone[CURSEG_ALL_DATA_ATGC]); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 55d29d50159e..007195a1d4eb 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -4188,6 +4188,7 @@ struct f2fs_stat_info { int gc_secs[2][2]; int tot_blks, data_blks, node_blks; int bg_data_blks, bg_node_blks; + int blkoff[NR_CURSEG_TYPE]; int curseg[NR_CURSEG_TYPE]; int cursec[NR_CURSEG_TYPE]; int curzone[NR_CURSEG_TYPE]; From 37345eae9deaa2e4f372eeb98f6594cd0ee0916e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 28 Nov 2025 17:25:07 +0800 Subject: [PATCH 37/39] f2fs: fix to not account invalid blocks in get_left_section_blocks() w/ LFS mode, in get_left_section_blocks(), we should not account the blocks which were used before and now are invalided, otherwise those blocks will be counted as freed one in has_curseg_enough_space(), result in missing to trigger GC in time. Cc: stable@kernel.org Fixes: 249ad438e1d9 ("f2fs: add a method for calculating the remaining blocks in the current segment in LFS mode.") Fixes: bf34c93d2645 ("f2fs: check curseg space before foreground GC") Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0b54d87409b0..07dcbcbeb7c6 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -612,10 +612,12 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, enum log_type type, unsigned int segno) { - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) - return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi, - (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + if (f2fs_lfs_mode(sbi)) { + unsigned int used_blocks = __is_large_section(sbi) ? SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) : 0; + return CAP_BLKS_PER_SEC(sbi) - used_blocks - CURSEG_I(sbi, type)->next_blkoff; + } return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); } From 8d1cb17aca466b361cca17834b8bb1cf3e3d1818 Mon Sep 17 00:00:00 2001 From: YH Lin Date: Fri, 28 Nov 2025 11:23:57 +0800 Subject: [PATCH 38/39] f2fs: optimize trace_f2fs_write_checkpoint with enums This patch optimizes the tracepoint by replacing these hardcoded strings with a new enumeration f2fs_cp_phase. 1.Defines enum f2fs_cp_phase with values for each checkpoint phase. 2.Updates trace_f2fs_write_checkpoint to accept a u16 phase argument instead of a string pointer. 3.Uses __print_symbolic in TP_printk to convert the enum values back to their corresponding strings for human-readable trace output. This change reduces the storage overhead for each trace event by replacing a variable-length string with a 2-byte integer, while maintaining the same readable output in ftrace. Signed-off-by: YH Lin Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++--- fs/f2fs/f2fs.h | 6 ++++++ include/trace/events/f2fs.h | 19 ++++++++++++++----- 3 files changed, 23 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4c401b5b2933..300664269eb6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1673,7 +1673,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) goto out; } - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_START_BLOCK_OPS); err = block_operations(sbi); if (err) @@ -1681,7 +1681,7 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) stat_cp_time(cpc, CP_TIME_OP_LOCK); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_BLOCK_OPS); f2fs_flush_merged_writes(sbi); @@ -1747,7 +1747,7 @@ stop: /* update CP_TIME to trigger checkpoint periodically */ f2fs_update_time(sbi, CP_TIME); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, CP_PHASE_FINISH_CHECKPOINT); out: if (cpc->reason != CP_RESIZE) f2fs_up_write(&sbi->cp_global_sem); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 007195a1d4eb..20edbb99b814 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -319,6 +319,12 @@ struct cp_control { struct cp_stats stats; }; +enum f2fs_cp_phase { + CP_PHASE_START_BLOCK_OPS, + CP_PHASE_FINISH_BLOCK_OPS, + CP_PHASE_FINISH_CHECKPOINT, +}; + /* * indicate meta/data type */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index e00611ead024..df4017dcc701 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -50,6 +50,9 @@ TRACE_DEFINE_ENUM(CP_PAUSE); TRACE_DEFINE_ENUM(CP_RESIZE); TRACE_DEFINE_ENUM(EX_READ); TRACE_DEFINE_ENUM(EX_BLOCK_AGE); +TRACE_DEFINE_ENUM(CP_PHASE_START_BLOCK_OPS); +TRACE_DEFINE_ENUM(CP_PHASE_FINISH_BLOCK_OPS); +TRACE_DEFINE_ENUM(CP_PHASE_FINISH_CHECKPOINT); #define show_block_type(type) \ __print_symbolic(type, \ @@ -175,6 +178,12 @@ TRACE_DEFINE_ENUM(EX_BLOCK_AGE); #define S_ALL_PERM (S_ISUID | S_ISGID | S_ISVTX | \ S_IRWXU | S_IRWXG | S_IRWXO) +#define show_cp_phase(phase) \ + __print_symbolic(phase, \ + { CP_PHASE_START_BLOCK_OPS, "start block_ops" }, \ + { CP_PHASE_FINISH_BLOCK_OPS, "finish block_ops" }, \ + { CP_PHASE_FINISH_CHECKPOINT, "finish checkpoint" }) + struct f2fs_sb_info; struct f2fs_io_info; struct extent_info; @@ -1573,26 +1582,26 @@ TRACE_EVENT(f2fs_readpages, TRACE_EVENT(f2fs_write_checkpoint, - TP_PROTO(struct super_block *sb, int reason, const char *msg), + TP_PROTO(struct super_block *sb, int reason, u16 phase), - TP_ARGS(sb, reason, msg), + TP_ARGS(sb, reason, phase), TP_STRUCT__entry( __field(dev_t, dev) __field(int, reason) - __string(dest_msg, msg) + __field(u16, phase) ), TP_fast_assign( __entry->dev = sb->s_dev; __entry->reason = reason; - __assign_str(dest_msg); + __entry->phase = phase; ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", show_dev(__entry->dev), show_cpreason(__entry->reason), - __get_str(dest_msg)) + show_cp_phase(__entry->phase)) ); DECLARE_EVENT_CLASS(f2fs_discard, From 76ee7fd6af6851ef78016139bd727057ba467c4e Mon Sep 17 00:00:00 2001 From: Chaitanya Kulkarni Date: Mon, 24 Nov 2025 15:48:05 -0800 Subject: [PATCH 39/39] f2fs: ignore discard return value __blkdev_issue_discard() always returns 0, making the error assignment in __submit_discard_cmd() dead code. Initialize err to 0 and remove the error assignment from the __blkdev_issue_discard() call to err. Move fault injection code into already present if branch where err is set to -EIO. This preserves the fault injection behavior while removing dead error handling. Reviewed-by: Martin K. Petersen Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Chao Yu Signed-off-by: Chaitanya Kulkarni Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8375dca7ed9e..c26424f47686 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1343,15 +1343,9 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, dc->di.len += len; + err = 0; if (time_to_inject(sbi, FAULT_DISCARD)) { err = -EIO; - } else { - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(start), - SECTOR_FROM_BLOCK(len), - GFP_NOFS, &bio); - } - if (err) { spin_lock_irqsave(&dc->lock, flags); if (dc->state == D_PARTIAL) dc->state = D_SUBMIT; @@ -1360,6 +1354,8 @@ static int __submit_discard_cmd(struct f2fs_sb_info *sbi, break; } + __blkdev_issue_discard(bdev, SECTOR_FROM_BLOCK(start), + SECTOR_FROM_BLOCK(len), GFP_NOFS, &bio); f2fs_bug_on(sbi, !bio); /*