diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index ede184b6eda1..5e75438e0b73 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -112,8 +112,6 @@ config BTRFS_EXPERIMENTAL - large folio and block size (> page size) support - - shutdown ioctl and auto-degradation support - - asynchronous checksum generation for data writes - remap-tree - logical address remapping tree diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 31b00b932588..b15122aa26f9 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -45,3 +45,7 @@ btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/free-space-tree-tests.o tests/extent-map-tests.o \ tests/raid-stripe-tree-tests.o tests/delayed-refs-tests.o \ tests/chunk-allocation-tests.o + +ifeq ($(CONFIG_BLK_DEV_ZONED),y) +btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/zoned-tests.o +endif diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 0428557fd77b..273924ca912c 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -858,11 +858,6 @@ static int add_missing_keys(struct btrfs_fs_info *fs_info, free_pref(ref); return PTR_ERR(eb); } - if (unlikely(!extent_buffer_uptodate(eb))) { - free_pref(ref); - free_extent_buffer(eb); - return -EIO; - } if (lock) btrfs_tree_read_lock(eb); @@ -1620,11 +1615,6 @@ again: ret = PTR_ERR(eb); goto out; } - if (unlikely(!extent_buffer_uptodate(eb))) { - free_extent_buffer(eb); - ret = -EIO; - goto out; - } if (!path->skip_locking) btrfs_tree_read_lock(eb); diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index 2a2a21aec817..cc0bd03048ba 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -4,6 +4,7 @@ * Copyright (C) 2022 Christoph Hellwig. 
*/ +#include #include #include "bio.h" #include "ctree.h" @@ -350,11 +351,18 @@ static void btrfs_check_read_bio(struct btrfs_bio *bbio, struct btrfs_device *de static void btrfs_log_dev_io_error(const struct bio *bio, struct btrfs_device *dev) { + blk_status_t sts = bio->bi_status; + if (!dev || !dev->bdev) return; - if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET) + if (unlikely(sts == BLK_STS_OK)) return; - + if (unlikely(sts != BLK_STS_IOERR && sts != BLK_STS_TARGET && + sts != BLK_STS_MEDIUM && sts != BLK_STS_PROTECTION)) { + btrfs_warn_rl(dev->fs_info, "bdev %s unexpected block io error: %d", + btrfs_dev_name(dev), sts); + return; + } if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0d17a369bda..e6f5a17a13e3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -728,7 +728,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; - u64 last = 0; + u64 last = block_group->start; u32 nritems; int ret; bool wakeup = true; @@ -737,7 +737,6 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) if (!path) return -ENOMEM; - last = max_t(u64, block_group->start, BTRFS_SUPER_INFO_OFFSET); extent_root = btrfs_extent_root(fs_info, last); if (unlikely(!extent_root)) { btrfs_err(fs_info, @@ -1613,6 +1612,24 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_lock(&space_info->lock); spin_lock(&block_group->lock); + + if (btrfs_is_zoned(fs_info) && btrfs_is_block_group_used(block_group) && + block_group->zone_unusable >= div_u64(block_group->length, 2)) { + /* + * If the block group has data left, but at least half + * of the block group is zone_unusable, mark it as + * reclaimable before continuing with the next block group. 
+ */ + + spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + + btrfs_mark_bg_to_reclaim(block_group); + + goto next; + } + if (btrfs_is_block_group_used(block_group) || (block_group->ro && !(block_group->flags & BTRFS_BLOCK_GROUP_REMAPPED)) || list_is_singular(&block_group->list) || @@ -1679,7 +1696,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) spin_unlock(&space_info->lock); /* We don't want to force the issue, only flip if it's ok. */ - ret = inc_block_group_ro(block_group, 0); + ret = inc_block_group_ro(block_group, false); up_write(&space_info->groups_sem); if (ret < 0) { ret = 0; @@ -1892,13 +1909,145 @@ static bool should_reclaim_block_group(const struct btrfs_block_group *bg, u64 b return true; } -void btrfs_reclaim_bgs_work(struct work_struct *work) +static int btrfs_reclaim_block_group(struct btrfs_block_group *bg, int *reclaimed) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + struct btrfs_space_info *space_info = bg->space_info; + u64 used; + u64 reserved; + u64 old_total; + int ret = 0; + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&space_info->lock); + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + if (bg->used == 0) { + /* + * It is possible that we trigger relocation on a block + * group as its extents are deleted and it first goes + * below the threshold, then shortly after goes empty. 
+ * + * In this case, relocating it does delete it, but has + * some overhead in relocation specific metadata, looking + * for the non-existent extents and running some extra + * transactions, which we can avoid by using one of the + * other mechanisms for dealing with empty block groups. + */ + if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) + btrfs_mark_bg_unused(bg); + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + /* + * The block group might no longer meet the reclaim condition by + * the time we get around to reclaiming it, so to avoid + * reclaiming overly full block_groups, skip reclaiming them. + * + * Since the decision making process also depends on the amount + * being freed, pass in a fake giant value to skip that extra + * check, which is more meaningful when adding to the list in + * the first place. + */ + if (!should_reclaim_block_group(bg, bg->length)) { + spin_unlock(&bg->lock); + spin_unlock(&space_info->lock); + up_write(&space_info->groups_sem); + return 0; + } + + spin_unlock(&bg->lock); + old_total = space_info->total_bytes; + spin_unlock(&space_info->lock); + + /* + * Get out fast, in case we're read-only or unmounting the + * filesystem. It is OK to drop block groups from the list even + * for the read-only case. As we did take the super write lock, + * "mount -o remount,ro" won't happen and read-only filesystem + * means it is forced read-only due to a fatal error. So, it + * never gets back to read-write to let us reclaim again. + */ + if (btrfs_need_cleaner_sleep(fs_info)) { + up_write(&space_info->groups_sem); + return 0; + } + + ret = inc_block_group_ro(bg, false); + up_write(&space_info->groups_sem); + if (ret < 0) + return ret; + + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. 
We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. + */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start, false); + if (ret) { + btrfs_dec_block_group_ro(bg); + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + used = 0; + reserved = 0; + spin_lock(&space_info->lock); + space_info->reclaim_errors++; + spin_unlock(&space_info->lock); + } + spin_lock(&space_info->lock); + space_info->reclaim_count++; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; + if (space_info->total_bytes < old_total) + btrfs_set_periodic_reclaim_ready(space_info, true); + spin_unlock(&space_info->lock); + if (!ret) + (*reclaimed)++; + + return ret; +} + +void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit) { - struct btrfs_fs_info *fs_info = - container_of(work, struct btrfs_fs_info, reclaim_bgs_work); struct btrfs_block_group *bg; struct btrfs_space_info *space_info; LIST_HEAD(retry_list); + int reclaimed = 0; if (!btrfs_should_reclaim(fs_info)) return; @@ -1925,10 +2074,7 @@ void btrfs_reclaim_bgs_work(struct 
work_struct *work) */ list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { - u64 used; - u64 reserved; - u64 old_total; - int ret = 0; + int ret; bg = list_first_entry(&fs_info->reclaim_bgs, struct btrfs_block_group, @@ -1937,126 +2083,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) space_info = bg->space_info; spin_unlock(&fs_info->unused_bgs_lock); + ret = btrfs_reclaim_block_group(bg, &reclaimed); - /* Don't race with allocators so take the groups_sem */ - down_write(&space_info->groups_sem); - - spin_lock(&space_info->lock); - spin_lock(&bg->lock); - if (bg->reserved || bg->pinned || bg->ro) { - /* - * We want to bail if we made new allocations or have - * outstanding allocations in this block group. We do - * the ro check in case balance is currently acting on - * this block group. - */ - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - } - if (bg->used == 0) { - /* - * It is possible that we trigger relocation on a block - * group as its extents are deleted and it first goes - * below the threshold, then shortly after goes empty. - * - * In this case, relocating it does delete it, but has - * some overhead in relocation specific metadata, looking - * for the non-existent extents and running some extra - * transactions, which we can avoid by using one of the - * other mechanisms for dealing with empty block groups. - */ - if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) - btrfs_mark_bg_unused(bg); - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - - } - /* - * The block group might no longer meet the reclaim condition by - * the time we get around to reclaiming it, so to avoid - * reclaiming overly full block_groups, skip reclaiming them. 
- * - * Since the decision making process also depends on the amount - * being freed, pass in a fake giant value to skip that extra - * check, which is more meaningful when adding to the list in - * the first place. - */ - if (!should_reclaim_block_group(bg, bg->length)) { - spin_unlock(&bg->lock); - spin_unlock(&space_info->lock); - up_write(&space_info->groups_sem); - goto next; - } - - spin_unlock(&bg->lock); - old_total = space_info->total_bytes; - spin_unlock(&space_info->lock); - - /* - * Get out fast, in case we're read-only or unmounting the - * filesystem. It is OK to drop block groups from the list even - * for the read-only case. As we did take the super write lock, - * "mount -o remount,ro" won't happen and read-only filesystem - * means it is forced read-only due to a fatal error. So, it - * never gets back to read-write to let us reclaim again. - */ - if (btrfs_need_cleaner_sleep(fs_info)) { - up_write(&space_info->groups_sem); - goto next; - } - - ret = inc_block_group_ro(bg, 0); - up_write(&space_info->groups_sem); - if (ret < 0) - goto next; - - /* - * The amount of bytes reclaimed corresponds to the sum of the - * "used" and "reserved" counters. We have set the block group - * to RO above, which prevents reservations from happening but - * we may have existing reservations for which allocation has - * not yet been done - btrfs_update_block_group() was not yet - * called, which is where we will transfer a reserved extent's - * size from the "reserved" counter to the "used" counter - this - * happens when running delayed references. When we relocate the - * chunk below, relocation first flushes delalloc, waits for - * ordered extent completion (which is where we create delayed - * references for data extents) and commits the current - * transaction (which runs delayed references), and only after - * it does the actual work to move extents out of the block - * group. 
So the reported amount of reclaimed bytes is - * effectively the sum of the 'used' and 'reserved' counters. - */ - spin_lock(&bg->lock); - used = bg->used; - reserved = bg->reserved; - spin_unlock(&bg->lock); - - trace_btrfs_reclaim_block_group(bg); - ret = btrfs_relocate_chunk(fs_info, bg->start, false); - if (ret) { - btrfs_dec_block_group_ro(bg); - btrfs_err(fs_info, "error relocating chunk %llu", - bg->start); - used = 0; - reserved = 0; - spin_lock(&space_info->lock); - space_info->reclaim_errors++; - spin_unlock(&space_info->lock); - } - spin_lock(&space_info->lock); - space_info->reclaim_count++; - space_info->reclaim_bytes += used; - space_info->reclaim_bytes += reserved; - if (space_info->total_bytes < old_total) - btrfs_set_periodic_reclaim_ready(space_info, true); - spin_unlock(&space_info->lock); - -next: if (ret && !READ_ONCE(space_info->periodic_reclaim)) btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); @@ -2074,6 +2102,8 @@ next: if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) goto end; spin_lock(&fs_info->unused_bgs_lock); + if (reclaimed >= limit) + break; } spin_unlock(&fs_info->unused_bgs_lock); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -2084,6 +2114,14 @@ end: btrfs_exclop_finish(fs_info); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + + btrfs_reclaim_block_groups(fs_info, -1); +} + void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) { btrfs_reclaim_sweep(fs_info); @@ -2222,7 +2260,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start, if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) io_stripe_size = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); - buf = kcalloc(map->num_stripes, sizeof(u64), GFP_NOFS); + buf = kzalloc_objs(u64, map->num_stripes, GFP_NOFS); if (!buf) { ret = -ENOMEM; goto out; @@ -2538,7 +2576,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, 
btrfs_mark_bg_unused(cache); } } else { - inc_block_group_ro(cache, 1); + inc_block_group_ro(cache, true); } return 0; @@ -2694,11 +2732,11 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_RAID0], list) - inc_block_group_ro(cache, 1); + inc_block_group_ro(cache, true); list_for_each_entry(cache, &space_info->block_groups[BTRFS_RAID_SINGLE], list) - inc_block_group_ro(cache, 1); + inc_block_group_ro(cache, true); } btrfs_init_global_block_rsv(info); @@ -3087,7 +3125,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, */ if (sb_rdonly(fs_info->sb)) { mutex_lock(&fs_info->ro_block_group_mutex); - ret = inc_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, false); mutex_unlock(&fs_info->ro_block_group_mutex); return ret; } @@ -3138,7 +3176,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, } } - ret = inc_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, false); if (!ret) goto out; if (ret == -ETXTBSY) @@ -3165,7 +3203,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, if (ret < 0) goto out; - ret = inc_block_group_ro(cache, 0); + ret = inc_block_group_ro(cache, false); if (ret == -ETXTBSY) goto unlock_out; out: @@ -3305,9 +3343,9 @@ fail: } -static int cache_save_setup(struct btrfs_block_group *block_group, - struct btrfs_trans_handle *trans, - struct btrfs_path *path) +static void cache_save_setup(struct btrfs_block_group *block_group, + struct btrfs_trans_handle *trans, + struct btrfs_path *path) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct inode *inode = NULL; @@ -3319,7 +3357,7 @@ static int cache_save_setup(struct btrfs_block_group *block_group, int ret = 0; if (!btrfs_test_opt(fs_info, SPACE_CACHE)) - return 0; + return; /* * If this block group is smaller than 100 megs don't bother caching the @@ -3329,11 +3367,11 @@ static int cache_save_setup(struct btrfs_block_group *block_group, 
spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); - return 0; + return; } if (TRANS_ABORTED(trans)) - return 0; + return; again: inode = lookup_free_space_inode(block_group, path); if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) { @@ -3343,7 +3381,13 @@ again: } if (IS_ERR(inode)) { - BUG_ON(retries); + if (retries) { + ret = PTR_ERR(inode); + btrfs_err(fs_info, + "failed to lookup free space inode after creation for block group %llu: %d", + block_group->start, ret); + goto out_free; + } retries++; if (block_group->ro) @@ -3414,10 +3458,8 @@ again: * We hit an ENOSPC when setting up the cache in this transaction, just * skip doing the setup, we've already cleared the cache so we're safe. */ - if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) { - ret = -ENOSPC; + if (test_bit(BTRFS_TRANS_CACHE_ENOSPC, &trans->transaction->flags)) goto out_put; - } /* * Try to preallocate enough space based on how big the block group is. 
@@ -3465,7 +3507,6 @@ out: spin_unlock(&block_group->lock); extent_changeset_free(data_reserved); - return ret; } int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index c03e04292900..0504cb357992 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -350,6 +350,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_chunk_map *map); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +void btrfs_reclaim_block_groups(struct btrfs_fs_info *fs_info, unsigned int limit); void btrfs_reclaim_bgs_work(struct work_struct *work); void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 6064dd00d041..9efb3016ef11 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -541,6 +541,31 @@ try_reserve: BTRFS_RESERVE_NO_FLUSH); if (!ret) return block_rsv; + + /* + * If we are being used for updating a log tree, fail immediately, which + * makes the fsync fallback to a transaction commit. + * + * We don't want to consume from the global block reserve, as that is + * precious space that may be needed to do updates to some trees for + * which we don't reserve space during a transaction commit (update root + * items in the root tree, device stat items in the device tree and + * quota tree updates, see btrfs_init_root_block_rsv()), or to fallback + * to in case we did not reserve enough space to run delayed items, + * delayed references, or anything else we need in order to avoid a + * transaction abort. + * + * We also don't want to do a reservation in flush emergency mode, as + * we end up using metadata that could be critical to allow a + * transaction to complete successfully and therefore increase the + * chances for a transaction abort. 
+ * + * Log trees are an optimization and should never consume from the + * global reserve or be allowed overcommitting metadata. + */ + if (btrfs_root_id(root) == BTRFS_TREE_LOG_OBJECTID) + return ERR_PTR(ret); + /* * If we couldn't reserve metadata bytes try and use some from * the global reserve if its space type is the same as the global diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 85199944c1eb..c5783ac1b646 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -180,7 +180,7 @@ static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_co /* * Common wrappers for page allocation from compression wrappers */ -struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp) { struct folio *folio = NULL; @@ -200,7 +200,7 @@ struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info) return folio; alloc: - return folio_alloc(GFP_NOFS, fs_info->block_min_order); + return folio_alloc(gfp, fs_info->block_min_order); } void btrfs_free_compr_folio(struct folio *folio) @@ -292,7 +292,7 @@ static void end_bbio_compressed_write(struct btrfs_bio *bbio) struct compressed_bio *cb = to_compressed_bio(bbio); struct folio_iter fi; - btrfs_finish_ordered_extent(cb->bbio.ordered, NULL, cb->start, cb->len, + btrfs_finish_ordered_extent(cb->bbio.ordered, cb->start, cb->len, cb->bbio.bio.bi_status == BLK_STS_OK); if (cb->writeback) @@ -330,7 +330,6 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, cb->start = ordered->file_offset; cb->len = ordered->num_bytes; ASSERT(cb->bbio.bio.bi_iter.bi_size == ordered->disk_num_bytes); - cb->compressed_len = ordered->disk_num_bytes; cb->bbio.bio.bi_iter.bi_sector = ordered->disk_bytenr >> SECTOR_SHIFT; cb->bbio.ordered = ordered; @@ -369,7 +368,8 @@ struct compressed_bio *btrfs_alloc_compressed_write(struct btrfs_inode *inode, static noinline int add_ra_bio_pages(struct 
inode *inode, u64 compressed_end, struct compressed_bio *cb, - int *memstall, unsigned long *pflags) + int *memstall, unsigned long *pflags, + bool direct_reclaim) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); pgoff_t end_index; @@ -377,6 +377,7 @@ static noinline int add_ra_bio_pages(struct inode *inode, u64 cur = cb->orig_bbio->file_offset + orig_bio->bi_iter.bi_size; u64 isize = i_size_read(inode); int ret; + gfp_t constraint_gfp, cache_gfp; struct folio *folio; struct extent_map *em; struct address_space *mapping = inode->i_mapping; @@ -406,6 +407,19 @@ static noinline int add_ra_bio_pages(struct inode *inode, end_index = (i_size_read(inode) - 1) >> PAGE_SHIFT; + /* + * Avoid direct reclaim when the caller does not allow it. Since + * add_ra_bio_pages() is always speculative, suppress allocation warnings + * in either case. + */ + if (!direct_reclaim) { + constraint_gfp = ~(__GFP_FS | __GFP_DIRECT_RECLAIM) | __GFP_NOWARN; + cache_gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; + } else { + constraint_gfp = (~__GFP_FS) | __GFP_NOWARN; + cache_gfp = GFP_NOFS | __GFP_NOWARN; + } + while (cur < compressed_end) { pgoff_t page_end; pgoff_t pg_index = cur >> PAGE_SHIFT; @@ -435,12 +449,12 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, constraint_gfp), 0, NULL); if (!folio) break; - if (filemap_add_folio(mapping, folio, pg_index, GFP_NOFS)) { + if (filemap_add_folio(mapping, folio, pg_index, cache_gfp)) { /* There is already a page, skip to page end */ cur += folio_size(folio); folio_put(folio); @@ -533,6 +547,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) unsigned int compressed_len; const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 file_offset = bbio->file_offset; + gfp_t gfp; u64 em_len; u64 em_start; struct extent_map *em; @@ -540,6 +555,17 @@ void 
btrfs_submit_compressed_read(struct btrfs_bio *bbio) int memstall = 0; int ret; + /* + * If this is a readahead bio, prevent direct reclaim. This is done to + * avoid stalling on speculative allocations when memory pressure is + * high. The demand fault will retry with GFP_NOFS and enter direct + * reclaim if needed. + */ + if (bbio->bio.bi_opf & REQ_RAHEAD) + gfp = (GFP_NOFS & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN; + else + gfp = GFP_NOFS; + /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); @@ -560,7 +586,6 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) em_start = em->start; cb->len = bbio->bio.bi_iter.bi_size; - cb->compressed_len = compressed_len; cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; cb->bbio.csum_search_commit_root = bbio->csum_search_commit_root; @@ -571,7 +596,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct folio *folio; u32 cur_len = min(compressed_len - i * min_folio_size, min_folio_size); - folio = btrfs_alloc_compr_folio(fs_info); + folio = btrfs_alloc_compr_folio(fs_info, gfp); if (!folio) { ret = -ENOMEM; goto out_free_bio; @@ -587,7 +612,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) ASSERT(cb->bbio.bio.bi_iter.bi_size == compressed_len); add_ra_bio_pages(&inode->vfs_inode, em_start + em_len, cb, &memstall, - &pflags); + &pflags, !(bbio->bio.bi_opf & REQ_RAHEAD)); cb->len = bbio->bio.bi_iter.bi_size; cb->bbio.bio.bi_iter.bi_sector = bbio->bio.bi_iter.bi_sector; diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 65b8bc4bbe0b..1022dc53ec51 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -36,6 +36,9 @@ struct btrfs_ordered_extent; #define BTRFS_MAX_COMPRESSED_PAGES (BTRFS_MAX_COMPRESSED / PAGE_SIZE) static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); +/* The max size for a single worker to compress. 
*/ +#define BTRFS_COMPRESSION_CHUNK_SIZE (SZ_512K) + /* Maximum size of data before compression */ #define BTRFS_MAX_UNCOMPRESSED (SZ_128K) @@ -48,9 +51,6 @@ struct compressed_bio { /* Number of bytes in the inode we're working on */ unsigned int len; - /* Number of bytes on disk */ - unsigned int compressed_len; - /* The compression algorithm for this bio */ u8 compress_type; @@ -98,7 +98,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio); int btrfs_compress_str2level(unsigned int type, const char *str, int *level_ret); -struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info); +struct folio *btrfs_alloc_compr_folio(struct btrfs_fs_info *fs_info, gfp_t gfp); void btrfs_free_compr_folio(struct folio *folio); struct workspace_manager { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7267b2502665..d70da290bedf 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -21,6 +21,7 @@ #include "fs.h" #include "accessors.h" #include "extent-tree.h" +#include "extent_io.h" #include "relocation.h" #include "file-item.h" @@ -590,6 +591,9 @@ int btrfs_force_cow_block(struct btrfs_trans_handle *trans, btrfs_tree_unlock(buf); free_extent_buffer_stale(buf); btrfs_mark_buffer_dirty(trans, cow); + + btrfs_inhibit_eb_writeback(trans, cow); + *cow_ret = cow; return 0; @@ -599,9 +603,9 @@ error_unlock_cow: return ret; } -static inline bool should_cow_block(const struct btrfs_trans_handle *trans, +static inline bool should_cow_block(struct btrfs_trans_handle *trans, const struct btrfs_root *root, - const struct extent_buffer *buf) + struct extent_buffer *buf) { if (btrfs_is_testing(root->fs_info)) return false; @@ -635,6 +639,7 @@ static inline bool should_cow_block(const struct btrfs_trans_handle *trans, if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) return true; + btrfs_inhibit_eb_writeback(trans, buf); return false; } @@ -762,22 +767,21 @@ int btrfs_bin_search(const struct extent_buffer *eb, int first_slot, while (low < high) { const int 
unit_size = eb->folio_size; - unsigned long oil; + unsigned long oif; unsigned long offset; struct btrfs_disk_key *tmp; struct btrfs_disk_key unaligned; - int mid; + u32 mid; mid = (low + high) / 2; offset = p + mid * item_size; - oil = get_eb_offset_in_folio(eb, offset); + oif = get_eb_offset_in_folio(eb, offset); - if (oil + key_size <= unit_size) { + if (oif + key_size <= unit_size) { const unsigned long idx = get_eb_folio_index(eb, offset); char *kaddr = folio_address(eb->folios[idx]); - oil = get_eb_offset_in_folio(eb, offset); - tmp = (struct btrfs_disk_key *)(kaddr + oil); + tmp = (struct btrfs_disk_key *)(kaddr + oif); } else { read_extent_buffer(eb, &unaligned, offset, key_size); tmp = &unaligned; @@ -822,7 +826,6 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, { int level = btrfs_header_level(parent); struct btrfs_tree_parent_check check = { 0 }; - struct extent_buffer *eb; if (slot < 0 || slot >= btrfs_header_nritems(parent)) return ERR_PTR(-ENOENT); @@ -835,16 +838,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, check.has_first_key = true; btrfs_node_key_to_cpu(parent, &check.first_key, slot); - eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), - &check); - if (IS_ERR(eb)) - return eb; - if (unlikely(!extent_buffer_uptodate(eb))) { - free_extent_buffer(eb); - return ERR_PTR(-EIO); - } - - return eb; + return read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot), + &check); } /* @@ -1503,7 +1498,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, reada_for_search(fs_info, p, parent_level, slot, key->objectid); /* first we do an atomic uptodate check */ - if (btrfs_buffer_uptodate(tmp, check.transid, true) > 0) { + if (btrfs_buffer_uptodate(tmp, check.transid, NULL) > 0) { /* * Do extra check for first_key, eb can be stale due to * being cached, read from scrub, or have multiple @@ -2106,6 +2101,7 @@ again: p->nodes[level + 1])) { 
write_lock_level = level + 1; btrfs_release_path(p); + trace_btrfs_search_slot_restart(root, level, "write_lock"); goto again; } @@ -2168,8 +2164,10 @@ cow_done: p->slots[level] = slot; ret2 = setup_nodes_for_search(trans, root, p, b, level, ins_len, &write_lock_level); - if (ret2 == -EAGAIN) + if (ret2 == -EAGAIN) { + trace_btrfs_search_slot_restart(root, level, "setup_nodes"); goto again; + } if (ret2) { ret = ret2; goto done; @@ -2185,6 +2183,7 @@ cow_done: if (slot == 0 && ins_len && write_lock_level < level + 1) { write_lock_level = level + 1; btrfs_release_path(p); + trace_btrfs_search_slot_restart(root, level, "slot_zero"); goto again; } @@ -2198,8 +2197,10 @@ cow_done: } ret2 = read_block_for_search(root, p, &b, slot, key); - if (ret2 == -EAGAIN && !p->nowait) + if (ret2 == -EAGAIN && !p->nowait) { + trace_btrfs_search_slot_restart(root, level, "read_block"); goto again; + } if (ret2) { ret = ret2; goto done; @@ -3896,7 +3897,7 @@ static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, goto err; } - ret = split_leaf(trans, root, &key, path, ins_len, 1); + ret = split_leaf(trans, root, &key, path, ins_len, true); if (ret) goto err; diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 56ff8afe9a22..09795439b9fb 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -596,8 +596,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(fs_info, dst_rsv, num_bytes, diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 3766ff29fbbb..605858c2d9a9 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -207,6 +207,30 @@ void btrfs_dec_delayed_refs_rsv_bg_updates(struct btrfs_fs_info 
*fs_info) * This will refill the delayed block_rsv up to 1 items size worth of space and * will return -ENOSPC if we can't make the reservation. */ +static int btrfs_zoned_cap_metadata_reservation(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *fs_info = space_info->fs_info; + struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv; + u64 usable; + u64 cap; + int ret = 0; + + if (!btrfs_is_zoned(fs_info)) + return 0; + + spin_lock(&space_info->lock); + usable = space_info->total_bytes - space_info->bytes_zone_unusable; + spin_unlock(&space_info->lock); + cap = usable >> 1; + + spin_lock(&block_rsv->lock); + if (block_rsv->size > cap) + ret = -EAGAIN; + spin_unlock(&block_rsv->lock); + + return ret; +} + int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, enum btrfs_reserve_flush_enum flush) { @@ -228,6 +252,10 @@ int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info, if (!num_bytes) return 0; + ret = btrfs_zoned_cap_metadata_reservation(space_info); + if (ret) + return ret; + ret = btrfs_reserve_metadata_bytes(space_info, num_bytes, flush); if (ret) return ret; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index b6c7da8e1bc8..8f8fa14886de 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -697,7 +697,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, /* the disk copy procedure reuses the scrub code */ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, btrfs_device_get_total_bytes(src_device), - &dev_replace->scrub_progress, 0, 1); + &dev_replace->scrub_progress, false, true); ret = btrfs_dev_replace_finishing(fs_info, ret); if (ret == -EINPROGRESS) @@ -1255,7 +1255,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, btrfs_device_get_total_bytes(dev_replace->srcdev), - &dev_replace->scrub_progress, 0, 1); + &dev_replace->scrub_progress, false, true); ret = 
btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 085a83ae9e62..84f1c64423d3 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -253,9 +253,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir_ino, /* Nothing found, we're safe */ if (ret == -ENOENT) return 0; - - if (ret < 0) - return ret; + return ret; } /* we found an item, look for our name in the item */ diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 9a63200d7a53..57167d56dc72 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -107,7 +107,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered); else - ret = nowait ? -EAGAIN : -ENOTBLK; + ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -625,7 +625,7 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, pos += submitted; length -= submitted; if (write) - btrfs_finish_ordered_extent(dio_data->ordered, NULL, + btrfs_finish_ordered_extent(dio_data->ordered, pos, length, false); else btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, @@ -657,9 +657,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) } if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - btrfs_finish_ordered_extent(bbio->ordered, NULL, - dip->file_offset, dip->bytes, - !bio->bi_status); + btrfs_finish_ordered_extent(bbio->ordered, dip->file_offset, + dip->bytes, !bio->bi_status); } else { btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1, NULL); @@ -735,7 +734,7 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - btrfs_finish_ordered_extent(dio_data->ordered, NULL, + btrfs_finish_ordered_extent(dio_data->ordered, file_offset, dip->bytes, !ret); 
bio->bi_status = errno_to_blk_status(ret); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1b0eb246b714..8a11be02eeb9 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ #include "relocation.h" #include "scrub.h" #include "super.h" -#include "delayed-inode.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -110,19 +109,23 @@ static void csum_tree_block(struct extent_buffer *buf, u8 *result) * detect blocks that either didn't get written at all or got written * in the wrong place. */ -int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, bool atomic) +int btrfs_buffer_uptodate(struct extent_buffer *eb, u64 parent_transid, + const struct btrfs_tree_parent_check *check) { if (!extent_buffer_uptodate(eb)) return 0; - if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) { + /* + * On a cache hit, the caller may still need tree parent + * verification before reusing the buffer. 
+ */ + if (unlikely(check && btrfs_verify_level_key(eb, check))) + return -EUCLEAN; return 1; + } - if (atomic) - return -EAGAIN; - - if (!extent_buffer_uptodate(eb) || - btrfs_header_generation(eb) != parent_transid) { + if (btrfs_header_generation(eb) != parent_transid) { btrfs_err_rl(eb->fs_info, "parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", eb->start, eb->read_mirror, @@ -730,7 +733,7 @@ void btrfs_global_root_delete(struct btrfs_root *root) } struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *key) + const struct btrfs_key *key) { struct rb_node *node; struct btrfs_root *root = NULL; @@ -767,7 +770,7 @@ static u64 btrfs_global_root_id(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_key key = { + const struct btrfs_key key = { .objectid = BTRFS_CSUM_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), @@ -778,7 +781,7 @@ struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr) struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr) { - struct btrfs_key key = { + const struct btrfs_key key = { .objectid = BTRFS_EXTENT_TREE_OBJECTID, .type = BTRFS_ROOT_ITEM_KEY, .offset = btrfs_global_root_id(fs_info, bytenr), @@ -994,8 +997,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, root->node = NULL; goto fail; } - if (unlikely(!btrfs_buffer_uptodate(root->node, generation, false))) { - ret = -EIO; + + ret = btrfs_buffer_uptodate(root->node, generation, &check); + if (unlikely(ret <= 0)) { + if (ret == 0) + ret = -EIO; goto fail; } @@ -1550,7 +1556,7 @@ sleep: wake_up_process(fs_info->cleaner_kthread); mutex_unlock(&fs_info->transaction_kthread_mutex); - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) btrfs_cleanup_transaction(fs_info); if (!kthread_should_stop() && 
(!btrfs_transaction_blocked(fs_info) || @@ -2025,11 +2031,6 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, btrfs_put_root(log_tree_root); return ret; } - if (unlikely(!extent_buffer_uptodate(log_tree_root->node))) { - btrfs_err(fs_info, "failed to read log tree"); - btrfs_put_root(log_tree_root); - return -EIO; - } /* returns with log_tree_root freed on success */ ret = btrfs_recover_log_trees(log_tree_root); @@ -2299,6 +2300,15 @@ static int validate_sys_chunk_array(const struct btrfs_fs_info *fs_info, return -EUCLEAN; } + /* It must hold at least one key and one chunk. */ + if (unlikely(sys_array_size < sizeof(struct btrfs_disk_key) + + sizeof(struct btrfs_chunk))) { + btrfs_err(fs_info, "system chunk array too small %u < %zu", + sys_array_size, + sizeof(struct btrfs_disk_key) + sizeof(struct btrfs_chunk)); + return -EUCLEAN; + } + while (cur < sys_array_size) { struct btrfs_disk_key *disk_key; struct btrfs_chunk *chunk; @@ -2365,11 +2375,11 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, int ret = 0; const bool ignore_flags = btrfs_test_opt(fs_info, IGNORESUPERFLAGS); - if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + if (unlikely(btrfs_super_magic(sb) != BTRFS_MAGIC)) { btrfs_err(fs_info, "no valid FS found"); ret = -EINVAL; } - if ((btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { + if (unlikely(btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)) { if (!ignore_flags) { btrfs_err(fs_info, "unrecognized or unsupported super flag 0x%llx", @@ -2381,17 +2391,17 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); } } - if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "tree_root level too big: %d >= %d", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (btrfs_super_chunk_root_level(sb) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_super_chunk_root_level(sb) >= 
BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "chunk_root level too big: %d >= %d", btrfs_super_chunk_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; } - if (btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL) { + if (unlikely(btrfs_super_log_root_level(sb) >= BTRFS_MAX_LEVEL)) { btrfs_err(fs_info, "log_root level too big: %d >= %d", btrfs_super_log_root_level(sb), BTRFS_MAX_LEVEL); ret = -EINVAL; @@ -2401,65 +2411,65 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || - sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + if (unlikely(!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE)) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } - if (!btrfs_supported_blocksize(sectorsize)) { + if (unlikely(!btrfs_supported_blocksize(sectorsize))) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); ret = -EINVAL; } - if (!is_power_of_2(nodesize) || nodesize < sectorsize || - nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + if (unlikely(!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE)) { btrfs_err(fs_info, "invalid nodesize %llu", nodesize); ret = -EINVAL; } - if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + if (unlikely(nodesize != le32_to_cpu(sb->__unused_leafsize))) { btrfs_err(fs_info, "invalid leafsize %u, should be %llu", le32_to_cpu(sb->__unused_leafsize), nodesize); ret = -EINVAL; } /* Root alignment check */ - if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { + if (unlikely(!IS_ALIGNED(btrfs_super_root(sb), sectorsize))) { btrfs_err(fs_info, "tree_root block unaligned: %llu", btrfs_super_root(sb)); ret = -EINVAL; } - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { + if 
(unlikely(!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize))) { btrfs_err(fs_info, "chunk_root block unaligned: %llu", btrfs_super_chunk_root(sb)); ret = -EINVAL; } - if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + if (unlikely(!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize))) { btrfs_err(fs_info, "log_root block unaligned: %llu", btrfs_super_log_root(sb)); ret = -EINVAL; } - if (!fs_info->fs_devices->temp_fsid && - memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0) { + if (unlikely(!fs_info->fs_devices->temp_fsid && + memcmp(fs_info->fs_devices->fsid, sb->fsid, BTRFS_FSID_SIZE) != 0)) { btrfs_err(fs_info, "superblock fsid doesn't match fsid of fs_devices: %pU != %pU", sb->fsid, fs_info->fs_devices->fsid); ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), - BTRFS_FSID_SIZE) != 0) { + if (unlikely(memcmp(fs_info->fs_devices->metadata_uuid, btrfs_sb_fsid_ptr(sb), + BTRFS_FSID_SIZE) != 0)) { btrfs_err(fs_info, "superblock metadata_uuid doesn't match metadata uuid of fs_devices: %pU != %pU", btrfs_sb_fsid_ptr(sb), fs_info->fs_devices->metadata_uuid); ret = -EINVAL; } - if (memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, - BTRFS_FSID_SIZE) != 0) { + if (unlikely(memcmp(fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid, + BTRFS_FSID_SIZE) != 0)) { btrfs_err(fs_info, "dev_item UUID does not match metadata fsid: %pU != %pU", fs_info->fs_devices->metadata_uuid, sb->dev_item.fsid); @@ -2470,9 +2480,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Artificial requirement for block-group-tree to force newer features * (free-space-tree, no-holes) so the test matrix is smaller. 
*/ - if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && - (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || - !btrfs_fs_incompat(fs_info, NO_HOLES))) { + if (unlikely(btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE) && + (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES)))) { btrfs_err(fs_info, "block-group-tree feature requires free-space-tree and no-holes"); ret = -EINVAL; @@ -2483,25 +2493,25 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Reduce test matrix for remap tree by requiring block-group-tree * and no-holes. Free-space-tree is a hard requirement. */ - if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || - !btrfs_fs_incompat(fs_info, NO_HOLES) || - !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE)) { + if (unlikely(!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID) || + !btrfs_fs_incompat(fs_info, NO_HOLES) || + !btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))) { btrfs_err(fs_info, "remap-tree feature requires free-space-tree, no-holes, and block-group-tree"); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { + if (unlikely(btrfs_fs_incompat(fs_info, MIXED_GROUPS))) { btrfs_err(fs_info, "remap-tree not supported with mixed-bg"); ret = -EINVAL; } - if (btrfs_fs_incompat(fs_info, ZONED)) { + if (unlikely(btrfs_fs_incompat(fs_info, ZONED))) { btrfs_err(fs_info, "remap-tree not supported with zoned devices"); ret = -EINVAL; } - if (sectorsize > PAGE_SIZE) { + if (unlikely(sectorsize > PAGE_SIZE)) { btrfs_err(fs_info, "remap-tree not supported when block size > page size"); ret = -EINVAL; } @@ -2511,66 +2521,47 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Hint to catch really bogus numbers, bitflips or so, more exact checks are * done later */ - if (btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb)) { + if (unlikely(btrfs_super_bytes_used(sb) < 6 * btrfs_super_nodesize(sb))) { btrfs_err(fs_info, "bytes_used is too small %llu", 
btrfs_super_bytes_used(sb)); ret = -EINVAL; } - if (!is_power_of_2(btrfs_super_stripesize(sb))) { + if (unlikely(!is_power_of_2(btrfs_super_stripesize(sb)))) { btrfs_err(fs_info, "invalid stripesize %u", btrfs_super_stripesize(sb)); ret = -EINVAL; } - if (btrfs_super_num_devices(sb) > (1UL << 31)) + if (unlikely(btrfs_super_num_devices(sb) > (1UL << 31))) btrfs_warn(fs_info, "suspicious number of devices: %llu", btrfs_super_num_devices(sb)); - if (btrfs_super_num_devices(sb) == 0) { + if (unlikely(btrfs_super_num_devices(sb) == 0)) { btrfs_err(fs_info, "number of devices is 0"); ret = -EINVAL; } - if (mirror_num >= 0 && - btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num)) { + if (unlikely(mirror_num >= 0 && + btrfs_super_bytenr(sb) != btrfs_sb_offset(mirror_num))) { btrfs_err(fs_info, "super offset mismatch %llu != %llu", btrfs_super_bytenr(sb), btrfs_sb_offset(mirror_num)); ret = -EINVAL; } - if (ret) + if (unlikely(ret)) return ret; ret = validate_sys_chunk_array(fs_info, sb); - /* - * Obvious sys_chunk_array corruptions, it must hold at least one key - * and one chunk - */ - if (btrfs_super_sys_array_size(sb) > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { - btrfs_err(fs_info, "system chunk array too big %u > %u", - btrfs_super_sys_array_size(sb), - BTRFS_SYSTEM_CHUNK_ARRAY_SIZE); - ret = -EINVAL; - } - if (btrfs_super_sys_array_size(sb) < sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)) { - btrfs_err(fs_info, "system chunk array too small %u < %zu", - btrfs_super_sys_array_size(sb), - sizeof(struct btrfs_disk_key) - + sizeof(struct btrfs_chunk)); - ret = -EINVAL; - } - /* * The generation is a global counter, we'll trust it more than the others * but it's still possible that it's the one that's wrong. 
*/ - if (btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb)) + if (unlikely(btrfs_super_generation(sb) < btrfs_super_chunk_root_generation(sb))) btrfs_warn(fs_info, "suspicious: generation < chunk_root_generation: %llu < %llu", btrfs_super_generation(sb), btrfs_super_chunk_root_generation(sb)); - if (btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) - && btrfs_super_cache_generation(sb) != (u64)-1) + if (unlikely(btrfs_super_generation(sb) < btrfs_super_cache_generation(sb) && + btrfs_super_cache_generation(sb) != (u64)-1)) btrfs_warn(fs_info, "suspicious: generation < cache_generation: %llu < %llu", btrfs_super_generation(sb), @@ -2601,7 +2592,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, int ret; ret = btrfs_validate_super(fs_info, sb, -1); - if (ret < 0) + if (unlikely(ret < 0)) goto out; if (unlikely(!btrfs_supported_super_csum(btrfs_super_csum_type(sb)))) { ret = -EUCLEAN; @@ -2618,7 +2609,7 @@ static int btrfs_validate_write_super(struct btrfs_fs_info *fs_info, goto out; } out: - if (ret < 0) + if (unlikely(ret < 0)) btrfs_err(fs_info, "super block corruption detected before writing it to disk"); return ret; @@ -2639,11 +2630,6 @@ static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int lev root->node = NULL; return ret; } - if (unlikely(!extent_buffer_uptodate(root->node))) { - free_extent_buffer(root->node); - root->node = NULL; - return -EIO; - } btrfs_set_root_node(&root->root_item, root->node); root->commit_root = btrfs_root_node(root); @@ -3674,7 +3660,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (fs_info->uuid_root && (btrfs_test_opt(fs_info, RESCAN_UUID_TREE) || - fs_info->generation != btrfs_super_uuid_tree_generation(disk_super))) { + !test_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags))) { btrfs_info(fs_info, "checking UUID tree"); ret = btrfs_check_uuid_tree(fs_info); if (ret) { @@ -3766,8 +3752,7 @@ static void 
btrfs_end_super_write(struct bio *bio) * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. * - * Write @max_mirrors copies of the superblock, where 0 means default that fit - * the expected device size at commit time. Note that max_mirrors must be + * Write @max_mirrors copies of the superblock. Note that max_mirrors must be * same for write and wait phases. * * Return number of errors when folio is not found or submission fails. @@ -3783,9 +3768,6 @@ static int write_dev_supers(struct btrfs_device *device, atomic_set(&device->sb_write_errors, 0); - if (max_mirrors == 0) - max_mirrors = BTRFS_SUPER_MIRROR_MAX; - for (i = 0; i < max_mirrors; i++) { struct folio *folio; struct bio *bio; @@ -3870,16 +3852,13 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) int ret; u64 bytenr; - if (max_mirrors == 0) - max_mirrors = BTRFS_SUPER_MIRROR_MAX; - for (i = 0; i < max_mirrors; i++) { struct folio *folio; ret = btrfs_sb_log_location(device, i, READ, &bytenr); if (ret == -ENOENT) { break; - } else if (ret < 0) { + } else if (unlikely(ret < 0)) { errors++; if (i == 0) primary_failed = true; @@ -3901,9 +3880,8 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) } errors += atomic_read(&device->sb_write_errors); - if (errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR) - primary_failed = true; - if (primary_failed) { + + if (unlikely(primary_failed || errors >= BTRFS_SUPER_PRIMARY_WRITE_ERROR)) { btrfs_err(device->fs_info, "error writing primary super block to device %llu", device->devid); return -1; @@ -3954,7 +3932,7 @@ static bool wait_dev_flush(struct btrfs_device *device) wait_for_completion_io(&device->flush_wait); - if (bio->bi_status) { + if (unlikely(bio->bi_status)) { set_bit(BTRFS_DEV_STATE_FLUSH_FAILED, &device->dev_state); btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); return true; @@ -3992,7 +3970,7 @@ static int barrier_all_devices(struct 
btrfs_fs_info *info) list_for_each_entry(dev, head, dev_list) { if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) continue; - if (!dev->bdev) { + if (unlikely(!dev->bdev)) { errors_wait++; continue; } @@ -4000,7 +3978,7 @@ static int barrier_all_devices(struct btrfs_fs_info *info) !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; - if (wait_dev_flush(dev)) + if (unlikely(wait_dev_flush(dev))) errors_wait++; } @@ -4043,26 +4021,27 @@ int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags) return min_tolerated; } -int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) +int write_all_supers(struct btrfs_trans_handle *trans) { + struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *head; struct btrfs_device *dev; struct btrfs_super_block *sb; struct btrfs_dev_item *dev_item; + int max_mirrors; int ret; int do_barriers; int max_errors; int total_errors = 0; - u64 flags; do_barriers = !btrfs_test_opt(fs_info, NOBARRIER); - /* - * max_mirrors == 0 indicates we're from commit_transaction, - * not from fsync where the tree roots in fs_info have not - * been consistent on disk. - */ - if (max_mirrors == 0) { + if (trans->transaction->state < TRANS_STATE_UNBLOCKED) { + /* We are called from fsync. */ + max_mirrors = 1; + } else { + /* We are called from transaction commit. 
*/ + max_mirrors = BTRFS_SUPER_MIRROR_MAX; ret = backup_super_roots(fs_info); if (ret < 0) return ret; @@ -4077,17 +4056,19 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) if (do_barriers) { ret = barrier_all_devices(fs_info); - if (ret) { + if (unlikely(ret)) { mutex_unlock( &fs_info->fs_devices->device_list_mutex); - btrfs_handle_fs_error(fs_info, ret, - "errors while submitting device barriers."); + btrfs_abort_transaction(trans, ret); + btrfs_err(fs_info, "error while submitting device barriers"); return ret; } } + btrfs_set_super_flags(sb, btrfs_super_flags(sb) | BTRFS_HEADER_FLAG_WRITTEN); + list_for_each_entry(dev, head, dev_list) { - if (!dev->bdev) { + if (unlikely(!dev->bdev)) { total_errors++; continue; } @@ -4109,19 +4090,17 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) memcpy(dev_item->fsid, dev->fs_devices->metadata_uuid, BTRFS_FSID_SIZE); - flags = btrfs_super_flags(sb); - btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); - ret = btrfs_validate_write_super(fs_info, sb); if (unlikely(ret < 0)) { mutex_unlock(&fs_info->fs_devices->device_list_mutex); - btrfs_handle_fs_error(fs_info, -EUCLEAN, - "unexpected superblock corruption detected"); - return -EUCLEAN; + btrfs_abort_transaction(trans, ret); + btrfs_err(fs_info, + "unexpected superblock corruption before writing it"); + return ret; } ret = write_dev_supers(dev, sb, max_mirrors); - if (ret) + if (unlikely(ret)) total_errors++; } if (unlikely(total_errors > max_errors)) { @@ -4130,29 +4109,27 @@ int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors) mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* FUA is masked off if unsupported and can't be the reason */ - btrfs_handle_fs_error(fs_info, -EIO, - "%d errors while writing supers", - total_errors); + btrfs_abort_transaction(trans, -EIO); + btrfs_err(fs_info, "%d errors while writing supers", total_errors); return -EIO; } total_errors = 0; list_for_each_entry(dev, 
head, dev_list) { - if (!dev->bdev) + if (unlikely(!dev->bdev)) continue; if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) continue; ret = wait_dev_supers(dev, max_mirrors); - if (ret) + if (unlikely(ret)) total_errors++; } mutex_unlock(&fs_info->fs_devices->device_list_mutex); if (unlikely(total_errors > max_errors)) { - btrfs_handle_fs_error(fs_info, -EIO, - "%d errors while writing supers", - total_errors); + btrfs_abort_transaction(trans, -EIO); + btrfs_err(fs_info, "%d errors while writing supers", total_errors); return -EIO; } return 0; @@ -4171,7 +4148,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, drop_ref = true; spin_unlock(&fs_info->fs_roots_radix_lock); - if (BTRFS_FS_ERROR(fs_info)) { + if (unlikely(BTRFS_FS_ERROR(fs_info))) { ASSERT(root->log_root == NULL); if (root->reloc_root) { btrfs_put_root(root->reloc_root); @@ -4457,13 +4434,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_put_block_group_cache(fs_info); - /* - * we must make sure there is not any read request to - * submit after we stopping all workers. - */ - invalidate_inode_pages2(fs_info->btree_inode->i_mapping); - btrfs_stop_all_workers(fs_info); - /* We shouldn't have any transaction open at this point */ warn_about_uncommitted_trans(fs_info); @@ -4471,6 +4441,13 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) free_root_pointers(fs_info, true); btrfs_free_fs_roots(fs_info); + /* + * We must make sure there is not any read request to + * submit after we stop all workers. 
+ */ + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + btrfs_stop_all_workers(fs_info); + /* * We must free the block groups after dropping the fs_roots as we could * have had an IO error and have left over tree log blocks that aren't diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 5320da83d0cf..9185f8f02eeb 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -58,7 +58,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info); int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); -int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); +int write_all_supers(struct btrfs_trans_handle *trans); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); @@ -76,7 +76,7 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, int btrfs_global_root_insert(struct btrfs_root *root); void btrfs_global_root_delete(struct btrfs_root *root); struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info, - struct btrfs_key *key); + const struct btrfs_key *key); struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr); struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr); @@ -107,7 +107,8 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root) void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, bool atomic); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, + const struct btrfs_tree_parent_check *check); int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); diff --git 
a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index d0dd50f7d279..626702244809 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -185,17 +185,25 @@ void btrfs_free_extent_state(struct extent_state *state) static int add_extent_changeset(struct extent_state *state, u32 bits, struct extent_changeset *changeset, - int set) + bool set) { + int ret; + if (!changeset) return 0; if (set && (state->state & bits) == bits) return 0; if (!set && (state->state & bits) == 0) return 0; - changeset->bytes_changed += state->end - state->start + 1; - return ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); + changeset->bytes_changed += state->end - state->start + 1; + if (!extent_changeset_tracks_ranges(changeset)) + return 0; + + ret = ulist_add(&changeset->range_changed, state->start, state->end, GFP_ATOMIC); + if (ret < 0) + return ret; + return 0; } static inline struct extent_state *next_state(struct extent_state *state) @@ -326,15 +334,10 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) -{ - btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, - "extent io tree error on %s state start %llu end %llu", - opname, state->start, state->end); -} +#define extent_io_tree_panic(tree, state, opname, err) \ + btrfs_panic(btrfs_extent_io_tree_to_fs_info((tree)), (err), \ + "extent io tree error on %s state start %llu end %llu", \ + (opname), (state)->start, (state)->end) static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) { @@ -394,8 +397,9 @@ static void set_state_bits(struct extent_io_tree *tree, if (tree->owner == IO_TREE_INODE_IO) btrfs_set_delalloc_extent(tree->inode, state, bits); - ret = add_extent_changeset(state, bits_to_set, changeset, 1); - 
BUG_ON(ret < 0); + ret = add_extent_changeset(state, bits_to_set, changeset, true); + if (unlikely(ret)) + extent_io_tree_panic(tree, state, "add_extent_changeset", ret); state->state |= bits_to_set; } @@ -535,6 +539,24 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, return 0; } +static inline void state_wake_up(struct extent_io_tree *tree, + struct extent_state *state, u32 bits) +{ + lockdep_assert_held(&tree->lock); + + if (!(bits & EXTENT_LOCK_BITS)) + return; + + /* + * No memory barriers because the tree's lock is held while: + * + * 1) Adding waiters to the queue. + * 2) Waking up waiters. + * 3) Removing waiters from queue. + */ + cond_wake_up_nomb(&state->wq); +} + /* * Use this during tree iteration to avoid doing next node searches when it's * not needed (the current record ends at or after the target range's end). @@ -549,14 +571,14 @@ static inline struct extent_state *next_search_state(struct extent_state *state, /* * Utility function to clear some bits in an extent state struct. It will - * optionally wake up anyone waiting on this state (wake == 1). + * optionally wake up anyone waiting on this state. 
* * If no bits are set on the state struct after clearing things, the * struct is freed and removed from the tree */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 bits, int wake, u64 end, + u32 bits, u64 end, struct extent_changeset *changeset) { struct extent_state *next; @@ -566,20 +588,19 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, if (tree->owner == IO_TREE_INODE_IO) btrfs_clear_delalloc_extent(tree->inode, state, bits); - ret = add_extent_changeset(state, bits_to_clear, changeset, 0); - BUG_ON(ret < 0); + ret = add_extent_changeset(state, bits_to_clear, changeset, false); + if (unlikely(ret)) + extent_io_tree_panic(tree, state, "add_extent_changeset", ret); state->state &= ~bits_to_clear; - if (wake) - wake_up(&state->wq); + state_wake_up(tree, state, bits); if (state->state == 0) { + if (unlikely(!extent_state_in_tree(state))) + extent_io_tree_panic(tree, state, "extent_state_in_tree", -EUCLEAN); + next = next_search_state(state, end); - if (extent_state_in_tree(state)) { - rb_erase(&state->rb_node, &tree->state); - RB_CLEAR_NODE(&state->rb_node); - btrfs_free_extent_state(state); - } else { - WARN_ON(1); - } + rb_erase(&state->rb_node, &tree->state); + RB_CLEAR_NODE(&state->rb_node); + btrfs_free_extent_state(state); } else { merge_state(tree, state); next = next_search_state(state, end); @@ -616,8 +637,8 @@ int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 u64 last_end; int ret = 0; bool clear; - bool wake; const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); + const u32 bits_to_clear = (bits & ~EXTENT_CTLBITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -630,7 +651,6 @@ int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = (bits & EXTENT_LOCK_BITS); clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); again: if (!prealloc) { @@ -696,18 
+716,58 @@ hit_next: */ if (state->start < start) { + /* + * If all bits are cleared, there's no point in allocating or + * using the prealloc extent, split the state record, insert the + * prealloc record and then remove this record. We can just + * adjust this record and move on to the next without adding or + * removing anything to the tree. + */ + if (state->end <= end && (state->state & ~bits_to_clear) == 0) { + const u64 orig_start = state->start; + + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, state, start); + + /* + * Temporarily adjust this state's range to match the + * range for which we are clearing bits. + */ + state->start = start; + + ret = add_extent_changeset(state, bits_to_clear, changeset, false); + if (unlikely(ret < 0)) { + extent_io_tree_panic(tree, state, + "add_extent_changeset", ret); + goto out; + } + + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); + + /* + * Now adjust the range to the section for which no bits + * are cleared. + */ + state->start = orig_start; + state->end = start - 1; + + state_wake_up(tree, state, bits); + state = next_search_state(state, end); + goto next; + } + prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (ret) { + if (unlikely(ret)) { extent_io_tree_panic(tree, state, "split", ret); goto out; } if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, end, - changeset); + state = clear_state_bit(tree, state, bits, end, changeset); goto next; } if (need_resched()) @@ -724,26 +784,60 @@ hit_next: * We need to split the extent, and clear the bit on the first half. */ if (state->start <= end && state->end > end) { + /* + * If all bits are cleared, there's no point in allocating or + * using the prealloc extent, split the state record, insert the + * prealloc record and then remove it.
We can just adjust the + * start offset of the current state and avoid all that. + */ + if ((state->state & ~bits_to_clear) == 0) { + const u64 orig_end = state->end; + + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, state, end + 1); + + /* + * Temporarily adjust the end offset to match the + * removed subrange to update the changeset. + */ + state->end = end; + + ret = add_extent_changeset(state, bits_to_clear, changeset, false); + if (unlikely(ret < 0)) { + extent_io_tree_panic(tree, state, + "add_extent_changeset", ret); + goto out; + } + + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); + + state->start = end + 1; + state->end = orig_end; + + state_wake_up(tree, state, bits); + goto out; + } + prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) { + if (unlikely(ret)) { extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; goto out; } - if (wake) - wake_up(&state->wq); + state_wake_up(tree, state, bits); - clear_state_bit(tree, prealloc, bits, wake, end, changeset); + clear_state_bit(tree, prealloc, bits, end, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, bits, wake, end, changeset); + state = clear_state_bit(tree, state, bits, end, changeset); next: if (last_end >= end) goto out; @@ -825,13 +919,13 @@ process_node: } } out: + spin_unlock(&tree->lock); /* This state is no longer useful, clear it and free it up. 
*/ if (cached_state && *cached_state) { state = *cached_state; *cached_state = NULL; btrfs_free_extent_state(state); } - spin_unlock(&tree->lock); } static void cache_state_if_flags(struct extent_state *state, @@ -1169,7 +1263,7 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, start); - if (ret) + if (unlikely(ret)) extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; @@ -1259,7 +1353,7 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) { + if (unlikely(ret)) { extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; goto out; @@ -1382,7 +1476,7 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + state = clear_state_bit(tree, state, clear_bits, end, NULL); if (last_end >= end) goto out; start = last_end + 1; @@ -1414,14 +1508,14 @@ hit_next: } ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (ret) { + if (unlikely(ret)) { extent_io_tree_panic(tree, state, "split", ret); goto out; } if (state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + state = clear_state_bit(tree, state, clear_bits, end, NULL); if (last_end >= end) goto out; start = last_end + 1; @@ -1498,7 +1592,7 @@ hit_next: } ret = split_state(tree, state, prealloc, end + 1); - if (ret) { + if (unlikely(ret)) { extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; goto out; @@ -1506,7 +1600,7 @@ hit_next: set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); + clear_state_bit(tree, prealloc, clear_bits, end, NULL); prealloc = NULL; goto out; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 
098e64106d02..391fad41c3b6 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4013,9 +4013,8 @@ static int do_allocation_clustered(struct btrfs_block_group *block_group, * Lock nesting * ============ * - * space_info::lock - * block_group::lock - * fs_info::treelog_bg_lock + * block_group::lock + * fs_info::treelog_bg_lock */ /* @@ -4028,7 +4027,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, struct btrfs_block_group **bg_ret) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct btrfs_space_info *space_info = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 start = block_group->start; u64 num_bytes = ffe_ctl->num_bytes; @@ -4089,7 +4087,6 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, */ } - spin_lock(&space_info->lock); spin_lock(&block_group->lock); spin_lock(&fs_info->treelog_bg_lock); spin_lock(&fs_info->relocation_bg_lock); @@ -4191,7 +4188,6 @@ out: spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); - spin_unlock(&space_info->lock); return ret; } @@ -4353,71 +4349,72 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return 1; /* See the comments for btrfs_loop_type for an explanation of the phases. */ - if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) { - ffe_ctl->index = 0; - /* - * We want to skip the LOOP_CACHING_WAIT step if we don't have - * any uncached bgs and we've already done a full search - * through. - */ - if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && - (!ffe_ctl->orig_have_caching_bg && full_search)) - ffe_ctl->loop++; + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) + return -ENOSPC; + + ffe_ctl->index = 0; + /* + * We want to skip the LOOP_CACHING_WAIT step if we don't have any + * uncached bgs and we've already done a full search through. 
+ */ + if (ffe_ctl->loop == LOOP_CACHING_NOWAIT && + (!ffe_ctl->orig_have_caching_bg && full_search)) ffe_ctl->loop++; + ffe_ctl->loop++; - if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { - struct btrfs_trans_handle *trans; - int exist = 0; + if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) { + struct btrfs_trans_handle *trans; + bool have_trans = false; - /* Check if allocation policy allows to create a new chunk */ - ret = can_allocate_chunk(fs_info, ffe_ctl); - if (ret) - return ret; + /* Check if allocation policy allows to create a new chunk. */ + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; - trans = current->journal_info; - if (trans) - exist = 1; - else - trans = btrfs_join_transaction(root); + trans = current->journal_info; + if (trans) + have_trans = true; + else + trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + if (IS_ERR(trans)) + return PTR_ERR(trans); - ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, - CHUNK_ALLOC_FORCE_FOR_EXTENT); + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, + CHUNK_ALLOC_FORCE_FOR_EXTENT); - /* Do not bail out on ENOSPC since we can do more. */ - if (ret == -ENOSPC) { - ret = 0; - ffe_ctl->loop++; - } - else if (ret < 0) - btrfs_abort_transaction(trans, ret); - else - ret = 0; - if (!exist) - btrfs_end_transaction(trans); - if (ret) - return ret; + /* Do not bail out on ENOSPC since we can do more. */ + if (ret == -ENOSPC) { + ret = 0; + ffe_ctl->loop++; + } else if (ret < 0) { + btrfs_abort_transaction(trans, ret); + } else { + ret = 0; } - if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { - if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) - return -ENOSPC; + if (!have_trans) + btrfs_end_transaction(trans); - /* - * Don't loop again if we already have no empty_size and - * no empty_cluster. 
- */ - if (ffe_ctl->empty_size == 0 && - ffe_ctl->empty_cluster == 0) - return -ENOSPC; - ffe_ctl->empty_size = 0; - ffe_ctl->empty_cluster = 0; - } - return 1; + if (ret) + return ret; } - return -ENOSPC; + + if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) { + if (ffe_ctl->policy != BTRFS_EXTENT_ALLOC_CLUSTERED) + return -ENOSPC; + + /* + * Don't loop again if we already have no empty_size and + * no empty_cluster. + */ + if (ffe_ctl->empty_size == 0 && ffe_ctl->empty_cluster == 0) + return -ENOSPC; + ffe_ctl->empty_size = 0; + ffe_ctl->empty_cluster = 0; + } + + return 1; } static int prepare_allocation_clustered(struct btrfs_fs_info *fs_info, @@ -5784,7 +5781,7 @@ static int check_next_block_uptodate(struct btrfs_trans_handle *trans, generation = btrfs_node_ptr_generation(path->nodes[level], path->slots[level]); - if (btrfs_buffer_uptodate(next, generation, false)) + if (btrfs_buffer_uptodate(next, generation, NULL)) return 0; check.level = level - 1; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5f97a3d2a8d7..1ba8a7d3587b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -520,7 +521,7 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; + u32 bio_size = 0; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, bio) { @@ -528,23 +529,16 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Our read/write should always be sector aligned. 
*/ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page write in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page write with offset %zu and length %zu", - fi.offset, fi.length); - - btrfs_finish_ordered_extent(bbio->ordered, folio, start, len, - !error); - if (error) - mapping_set_error(folio->mapping, error); + bio_size += len; + ASSERT(btrfs_folio_test_ordered(fs_info, folio, start, len)); + btrfs_folio_clear_ordered(fs_info, folio, start, len); btrfs_folio_clear_writeback(fs_info, folio, start, len); } + if (error) + mapping_set_error(bbio->inode->vfs_inode.i_mapping, error); + + btrfs_finish_ordered_extent(bbio->ordered, bbio->file_offset, bio_size, !error); bio_put(bio); } @@ -1587,7 +1581,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, u64 start = page_start + (start_bit << fs_info->sectorsize_bits); u32 len = (end_bit - start_bit) << fs_info->sectorsize_bits; - btrfs_mark_ordered_io_finished(inode, folio, start, len, false); + btrfs_folio_clear_ordered(fs_info, folio, start, len); + btrfs_mark_ordered_io_finished(inode, start, len, false); } return ret; } @@ -1663,6 +1658,7 @@ static int submit_one_sector(struct btrfs_inode *inode, * ordered extent. */ btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize); + btrfs_folio_clear_ordered(fs_info, folio, filepos, sectorsize); btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize); btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize); @@ -1670,8 +1666,8 @@ static int submit_one_sector(struct btrfs_inode *inode, * Since there is no bio submitted to finish the ordered * extent, we have to manually finish this sector. 
*/ - btrfs_mark_ordered_io_finished(inode, folio, filepos, - fs_info->sectorsize, false); + btrfs_mark_ordered_io_finished(inode, filepos, fs_info->sectorsize, + false); return PTR_ERR(em); } @@ -1783,8 +1779,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, spin_unlock(&inode->ordered_tree_lock); btrfs_put_ordered_extent(ordered); - btrfs_mark_ordered_io_finished(inode, folio, cur, - fs_info->sectorsize, true); + btrfs_folio_clear_ordered(fs_info, folio, cur, fs_info->sectorsize); + btrfs_mark_ordered_io_finished(inode, cur, fs_info->sectorsize, true); /* * This range is beyond i_size, thus we don't need to * bother writing back. @@ -1949,7 +1945,9 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e * of time. */ spin_lock(&eb->refs_lock); - if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + if ((wbc->sync_mode == WB_SYNC_ALL || + atomic_read(&eb->writeback_inhibitors) == 0) && + test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->nodesize_bits); unsigned long flags; @@ -2396,39 +2394,13 @@ retry: index = 0; goto retry; } + /* - * If something went wrong, don't allow any metadata write bio to be - * submitted. - * - * This would prevent use-after-free if we had dirty pages not - * cleaned up, which can still happen by fuzzed images. - * - * - Bad extent tree - * Allowing existing tree block to be allocated for other trees. - * - * - Log tree operations - * Exiting tree blocks get allocated to log tree, bumps its - * generation, then get cleaned in tree re-balance. - * Such tree block will not be written back, since it's clean, - * thus no WRITTEN flag set. - * And after log writes back, this tree block is not traced by - * any dirty extent_io_tree. - * - * - Offending tree block gets re-dirtied from its original owner - * Since it has bumped generation, no WRITTEN flag, it can be - * reused without COWing. 
This tree block will not be traced - * by btrfs_transaction::dirty_pages. - * - * Now such dirty tree block will not be cleaned by any dirty - * extent io tree. Thus we don't want to submit such wild eb - * if the fs already has error. - * - * We can get ret > 0 from submit_extent_folio() indicating how many ebs - * were submitted. Reset it to 0 to avoid false alerts for the caller. + * Only btrfs_check_meta_write_pointer() can update @ret, + * and it only returns 0 or errors. */ - if (ret > 0) - ret = 0; - if (!ret && BTRFS_FS_ERROR(fs_info)) + ASSERT(ret <= 0); + if (unlikely(!ret && BTRFS_FS_ERROR(fs_info))) ret = -EROFS; if (ctx.zoned_bg) @@ -2659,8 +2631,7 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f if (IS_ERR(folio)) { cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); cur_len = cur_end + 1 - cur; - btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, - cur, cur_len, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); cur = cur_end; continue; @@ -3011,6 +2982,64 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } +/* + * Inhibit writeback on buffer during transaction. + * + * @trans: transaction handle that will own the inhibitor + * @eb: extent buffer to inhibit writeback on + * + * Attempt to track this extent buffer in the transaction's inhibited set. If + * memory allocation fails, the buffer is simply not tracked. It may be written + * back and need re-COW, which is the original behavior. This is acceptable + * since inhibiting writeback is an optimization. + */ +void btrfs_inhibit_eb_writeback(struct btrfs_trans_handle *trans, struct extent_buffer *eb) +{ + unsigned long index = eb->start >> trans->fs_info->nodesize_bits; + void *old; + + lockdep_assert_held(&eb->lock); + /* Check if already inhibited by this handle. 
*/ + old = xa_load(&trans->writeback_inhibited_ebs, index); + if (old == eb) + return; + + /* Take reference for the xarray entry. */ + refcount_inc(&eb->refs); + + old = xa_store(&trans->writeback_inhibited_ebs, index, eb, GFP_NOFS); + if (xa_is_err(old)) { + /* Allocation failed, just skip inhibiting this buffer. */ + free_extent_buffer(eb); + return; + } + + /* Handle replacement of different eb at same index. */ + if (old && old != eb) { + struct extent_buffer *old_eb = old; + + atomic_dec(&old_eb->writeback_inhibitors); + free_extent_buffer(old_eb); + } + + atomic_inc(&eb->writeback_inhibitors); +} + +/* + * Uninhibit writeback on all extent buffers. + */ +void btrfs_uninhibit_all_eb_writeback(struct btrfs_trans_handle *trans) +{ + struct extent_buffer *eb; + unsigned long index; + + xa_for_each(&trans->writeback_inhibited_ebs, index, eb) { + atomic_dec(&eb->writeback_inhibitors); + free_extent_buffer(eb); + } + xa_destroy(&trans->writeback_inhibited_ebs); +} + static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { @@ -3021,6 +3050,7 @@ static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); + atomic_set(&eb->writeback_inhibitors, 0); btrfs_leak_debug_add_eb(eb); @@ -3871,8 +3901,17 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, struct btrfs_fs_info *fs_info = eb->fs_info; struct btrfs_bio *bbio; - if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) + if (extent_buffer_uptodate(eb)) { + int ret; + + ret = btrfs_buffer_uptodate(eb, 0, check); + if (unlikely(ret <= 0)) { + if (ret == 0) + ret = -EIO; + return ret; + } return 0; + } /* * We could have had EXTENT_BUFFER_UPTODATE cleared by the write @@ -3892,8 +3931,16 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, * started and finished reading the same eb. 
In this case, UPTODATE * will now be set, and we shouldn't read it in again. */ - if (unlikely(test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) { + if (unlikely(extent_buffer_uptodate(eb))) { + int ret; + clear_extent_buffer_reading(eb); + ret = btrfs_buffer_uptodate(eb, 0, check); + if (unlikely(ret <= 0)) { + if (ret == 0) + ret = -EIO; + return ret; + } return 0; } @@ -3929,7 +3976,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int mirror_num, return ret; wait_on_bit_io(&eb->bflags, EXTENT_BUFFER_READING, TASK_UNINTERRUPTIBLE); - if (unlikely(!test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags))) + if (unlikely(!extent_buffer_uptodate(eb))) return -EIO; return 0; } @@ -3971,7 +4018,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, size_t cur; size_t offset; char *dst = (char *)dstv; - unsigned long i = get_eb_folio_index(eb, start); + unsigned long i; if (check_eb_range(eb, start, len)) { /* @@ -3988,7 +4035,7 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, } offset = get_eb_offset_in_folio(eb, start); - + i = get_eb_folio_index(eb, start); while (len > 0) { char *kaddr; @@ -4011,11 +4058,11 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, size_t cur; size_t offset; char __user *dst = (char __user *)dstv; - unsigned long i = get_eb_folio_index(eb, start); + unsigned long i; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; if (eb->addr) { if (copy_to_user_nofault(dstv, eb->addr + start, len)) @@ -4024,7 +4071,7 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, } offset = get_eb_offset_in_folio(eb, start); - + i = get_eb_folio_index(eb, start); while (len > 0) { char *kaddr; @@ -4052,7 +4099,7 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, size_t offset; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = get_eb_folio_index(eb, start); 
+ unsigned long i; int ret = 0; if (check_eb_range(eb, start, len)) @@ -4062,7 +4109,7 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, return memcmp(ptrv, eb->addr + start, len); offset = get_eb_offset_in_folio(eb, start); - + i = get_eb_folio_index(eb, start); while (len > 0) { cur = min(len, unit_size - offset); kaddr = folio_address(eb->folios[i]); @@ -4122,7 +4169,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, size_t offset; char *kaddr; const char *src = (const char *)srcv; - unsigned long i = get_eb_folio_index(eb, start); + unsigned long i; /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); @@ -4138,7 +4185,7 @@ static void __write_extent_buffer(const struct extent_buffer *eb, } offset = get_eb_offset_in_folio(eb, start); - + i = get_eb_folio_index(eb, start); while (len > 0) { if (check_uptodate) assert_eb_folio_uptodate(eb, i); @@ -4224,7 +4271,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, size_t cur; size_t offset; char *kaddr; - unsigned long i = get_eb_folio_index(dst, dst_offset); + unsigned long i; if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -4234,6 +4281,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, offset = get_eb_offset_in_folio(dst, dst_offset); + i = get_eb_folio_index(dst, dst_offset); while (len > 0) { assert_eb_folio_uptodate(dst, i); @@ -4606,7 +4654,7 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, if (IS_ERR(eb)) return; - if (btrfs_buffer_uptodate(eb, gen, true)) { + if (btrfs_buffer_uptodate(eb, gen, NULL)) { free_extent_buffer(eb); return; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8d05f1a58b7c..fd209233317f 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -99,6 +99,8 @@ struct extent_buffer { spinlock_t refs_lock; refcount_t refs; int read_mirror; + /* Inhibit 
WB_SYNC_NONE writeback when > 0. */ + atomic_t writeback_inhibitors; /* >= 0 if eb belongs to a log tree, -1 otherwise */ s8 log_index; u8 folio_shift; @@ -196,6 +198,25 @@ static inline void extent_changeset_init(struct extent_changeset *changeset) ulist_init(&changeset->range_changed); } +/* + * Sentinel value for range_changed.prealloc indicating that the changeset + * only tracks bytes_changed and does not record individual ranges. This + * avoids GFP_ATOMIC allocations inside add_extent_changeset() when the + * caller doesn't need to iterate the changed ranges afterwards. + */ +#define EXTENT_CHANGESET_BYTES_ONLY ((struct ulist_node *)1) + +static inline void extent_changeset_init_bytes_only(struct extent_changeset *changeset) +{ + changeset->bytes_changed = 0; + changeset->range_changed.prealloc = EXTENT_CHANGESET_BYTES_ONLY; +} + +static inline bool extent_changeset_tracks_ranges(const struct extent_changeset *changeset) +{ + return changeset->range_changed.prealloc != EXTENT_CHANGESET_BYTES_ONLY; +} + static inline struct extent_changeset *extent_changeset_alloc(void) { struct extent_changeset *ret; @@ -210,6 +231,7 @@ static inline struct extent_changeset *extent_changeset_alloc(void) static inline void extent_changeset_prealloc(struct extent_changeset *changeset, gfp_t gfp_mask) { + ASSERT(extent_changeset_tracks_ranges(changeset)); ulist_prealloc(&changeset->range_changed, gfp_mask); } @@ -218,7 +240,8 @@ static inline void extent_changeset_release(struct extent_changeset *changeset) if (!changeset) return; changeset->bytes_changed = 0; - ulist_release(&changeset->range_changed); + if (extent_changeset_tracks_ranges(changeset)) + ulist_release(&changeset->range_changed); } static inline void extent_changeset_free(struct extent_changeset *changeset) @@ -298,7 +321,7 @@ static inline int __pure num_extent_folios(const struct extent_buffer *eb) return num_extent_pages(eb); } -static inline int extent_buffer_uptodate(const struct extent_buffer *eb) +static 
inline bool extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); } @@ -381,4 +404,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info); #define btrfs_extent_buffer_leak_debug_check(fs_info) do {} while (0) #endif +void btrfs_inhibit_eb_writeback(struct btrfs_trans_handle *trans, + struct extent_buffer *eb); +void btrfs_uninhibit_all_eb_writeback(struct btrfs_trans_handle *trans); + #endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index ed8ecf44fbd0..d72249390030 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -1097,9 +1097,9 @@ static int find_next_csum_offset(struct btrfs_root *root, return 0; } -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums) +int btrfs_insert_data_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 5645c5e3abdb..6c678787c770 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -61,9 +61,9 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid, u64 bytenr, int mod); -int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_ordered_sum *sums); +int btrfs_insert_data_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); int btrfs_csum_one_bio(struct btrfs_bio *bbio, bool async); int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index a4cb9d3cfc4e..cf1cb5c4db75 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1445,7 +1445,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, 
struct iov_iter *from, * have opened a file as writable, we have to stop this write operation * to ensure consistency. */ - if (BTRFS_FS_ERROR(inode->root->fs_info)) + if (unlikely(BTRFS_FS_ERROR(inode->root->fs_info))) return -EROFS; if (encoded && (iocb->ki_flags & IOCB_NOWAIT)) @@ -3316,8 +3316,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end *delalloc_start_ret = start; delalloc_len = btrfs_count_range_bits(&inode->io_tree, delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1, - cached_state); + len, EXTENT_DELALLOC, + true, cached_state); } else { spin_unlock(&inode->lock); } diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 3de3b517810e..a4758d94b32e 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include "extent-io-tree.h" @@ -966,13 +967,13 @@ struct btrfs_fs_info { #define inode_to_fs_info(_inode) (BTRFS_I(_Generic((_inode), \ struct inode *: (_inode)))->root->fs_info) -static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) +static inline gfp_t btrfs_alloc_write_mask(const struct address_space *mapping) { return mapping_gfp_constraint(mapping, ~__GFP_FS); } /* Return the minimal folio size of the fs. */ -static inline unsigned int btrfs_min_folio_size(struct btrfs_fs_info *fs_info) +static inline unsigned int btrfs_min_folio_size(const struct btrfs_fs_info *fs_info) { return 1U << (PAGE_SHIFT + fs_info->block_min_order); } @@ -1199,8 +1200,10 @@ static inline void btrfs_force_shutdown(struct btrfs_fs_info *fs_info) * So here we only mark the fs error without flipping it RO. 
*/ WRITE_ONCE(fs_info->fs_error, -EIO); - if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) + if (!test_and_set_bit(BTRFS_FS_STATE_EMERGENCY_SHUTDOWN, &fs_info->fs_state)) { btrfs_crit(fs_info, "emergency shutdown"); + fserror_report_shutdown(fs_info->sb, GFP_KERNEL); + } } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f643a0520872..40474014c03f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -74,7 +74,6 @@ #include "delayed-inode.h" #define COW_FILE_RANGE_KEEP_LOCKED (1UL << 0) -#define COW_FILE_RANGE_NO_INLINE (1UL << 1) struct btrfs_iget_args { u64 ino; @@ -424,7 +423,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, folio_put(folio); } - return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, offset, bytes, false); } static int btrfs_dirty_inode(struct btrfs_inode *inode); @@ -622,6 +621,10 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, * * If being used directly, you must have already checked we're allowed to cow * the range by getting true from can_cow_file_range_inline(). + * + * Return 0 if the inlined extent is created successfully. + * Return <0 for critical error, which should be treated as a writeback error. + * Return >0 if an inlined extent cannot be created (mostly due to lack of meta space).
*/ static noinline int __cow_file_range_inline(struct btrfs_inode *inode, u64 size, size_t compressed_size, @@ -703,55 +706,6 @@ out: return ret; } -static noinline int cow_file_range_inline(struct btrfs_inode *inode, - struct folio *locked_folio, - u64 offset, u64 end, - size_t compressed_size, - int compress_type, - struct folio *compressed_folio, - bool update_i_size) -{ - struct extent_state *cached = NULL; - unsigned long clear_flags = EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING | EXTENT_LOCKED; - u64 size = min_t(u64, i_size_read(&inode->vfs_inode), end + 1); - int ret; - - if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) - return 1; - - btrfs_lock_extent(&inode->io_tree, offset, end, &cached); - ret = __cow_file_range_inline(inode, size, compressed_size, - compress_type, compressed_folio, - update_i_size); - if (ret > 0) { - btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); - return ret; - } - - /* - * In the successful case (ret == 0 here), cow_file_range will return 1. - * - * Quite a bit further up the callstack in extent_writepage(), ret == 1 - * is treated as a short circuited success and does not unlock the folio, - * so we must do it here. - * - * In the failure case, the locked_folio does get unlocked by - * btrfs_folio_end_all_writers, which asserts that it is still locked - * at that point, so we must *not* unlock it here. - * - * The other two callsites in compress_file_range do not have a - * locked_folio, so they are not relevant to this logic. - */ - if (ret == 0) - locked_folio = NULL; - - extent_clear_unlock_delalloc(inode, offset, end, locked_folio, &cached, - clear_flags, PAGE_UNLOCK | - PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); - return ret; -} - struct async_extent { u64 start; u64 ram_size; @@ -797,7 +751,7 @@ static int add_async_extent(struct async_chunk *cow, u64 start, u64 ram_size, * options, defragmentation, properties or heuristics. 
*/ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, - u64 end) + u64 end, bool check_inline) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -811,8 +765,10 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * do not even bother try compression, as there will be no space saving * and will always fallback to regular write later. */ - if (start != 0 && end + 1 - start <= fs_info->sectorsize) + if (end + 1 - start <= fs_info->sectorsize && + (!check_inline || (start > 0 || end + 1 < inode->disk_i_size))) return 0; + /* Defrag ioctl takes precedence over mount options and properties. */ if (inode->defrag_compress == BTRFS_DEFRAG_DONT_COMPRESS) return 0; @@ -890,28 +846,20 @@ static struct folio *compressed_bio_last_folio(struct compressed_bio *cb) return page_folio(phys_to_page(paddr)); } -static void zero_last_folio(struct compressed_bio *cb) -{ - struct bio *bio = &cb->bbio.bio; - struct folio *last_folio = compressed_bio_last_folio(cb); - const u32 bio_size = bio->bi_iter.bi_size; - const u32 foffset = offset_in_folio(last_folio, bio_size); - - folio_zero_range(last_folio, foffset, folio_size(last_folio) - foffset); -} - static void round_up_last_block(struct compressed_bio *cb, u32 blocksize) { struct bio *bio = &cb->bbio.bio; struct folio *last_folio = compressed_bio_last_folio(cb); const u32 bio_size = bio->bi_iter.bi_size; const u32 foffset = offset_in_folio(last_folio, bio_size); + const u32 padding_len = round_up(foffset, blocksize) - foffset; bool ret; if (IS_ALIGNED(bio_size, blocksize)) return; - ret = bio_add_folio(bio, last_folio, round_up(foffset, blocksize) - foffset, foffset); + folio_zero_range(last_folio, foffset, padding_len); + ret = bio_add_folio(bio, last_folio, padding_len, foffset); /* The remaining part should be merged thus never fail. 
*/ ASSERT(ret); } @@ -935,9 +883,7 @@ static void compress_file_range(struct btrfs_work *work) container_of(work, struct async_chunk, work); struct btrfs_inode *inode = async_chunk->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct address_space *mapping = inode->vfs_inode.i_mapping; struct compressed_bio *cb = NULL; - const u32 min_folio_size = btrfs_min_folio_size(fs_info); u64 blocksize = fs_info->sectorsize; u64 start = async_chunk->start; u64 end = async_chunk->end; @@ -947,7 +893,6 @@ static void compress_file_range(struct btrfs_work *work) int ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned int loff; int compress_type = fs_info->compress_type; int compress_level = fs_info->compress_level; @@ -1009,7 +954,7 @@ again: * been flagged as NOCOMPRESS. This flag can change at any time if we * discover bad compression ratios. */ - if (!inode_need_compress(inode, start, end)) + if (!inode_need_compress(inode, start, end, false)) goto cleanup_and_bail_uncompressed; if (0 < inode->defrag_compress && inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { @@ -1030,43 +975,13 @@ again: total_compressed = cb->bbio.bio.bi_iter.bi_size; total_in = cur_len; - /* - * Zero the tail end of the last folio, as we might be sending it down - * to disk. - */ - loff = (total_compressed & (min_folio_size - 1)); - if (loff) - zero_last_folio(cb); - - /* - * Try to create an inline extent. - * - * If we didn't compress the entire range, try to create an uncompressed - * inline extent, else a compressed one. - * - * Check cow_file_range() for why we don't even try to create inline - * extent for the subpage case. 
- */ - if (total_in < actual_end) - ret = cow_file_range_inline(inode, NULL, start, end, 0, - BTRFS_COMPRESS_NONE, NULL, false); - else - ret = cow_file_range_inline(inode, NULL, start, end, total_compressed, - compress_type, - bio_first_folio_all(&cb->bbio.bio), false); - if (ret <= 0) { - cleanup_compressed_bio(cb); - if (ret < 0) - mapping_set_error(mapping, -EIO); - return; - } - /* * We aren't doing an inline extent. Round the compressed size up to a * block size boundary so the allocator does sane things. */ - total_compressed = ALIGN(total_compressed, blocksize); round_up_last_block(cb, blocksize); + total_compressed = cb->bbio.bio.bi_iter.bi_size; + ASSERT(IS_ALIGNED(total_compressed, blocksize)); /* * One last check to make sure the compression is really a win, compare @@ -1437,11 +1352,6 @@ free_reserved: * * When this function fails, it unlocks all folios except @locked_folio. * - * When this function successfully creates an inline extent, it returns 1 and - * unlocks all folios including locked_folio and starts I/O on them. - * (In reality inline extents are limited to a single block, so locked_folio is - * the only folio handled anyway). - * * When this function succeed and creates a normal extent, the folio locking * status depends on the passed in flags: * @@ -1485,25 +1395,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, ASSERT(num_bytes <= btrfs_super_total_bytes(fs_info->super_copy)); inode_should_defrag(inode, start, end, num_bytes, SZ_64K); - - if (!(flags & COW_FILE_RANGE_NO_INLINE)) { - /* lets try to make an inline extent */ - ret = cow_file_range_inline(inode, locked_folio, start, end, 0, - BTRFS_COMPRESS_NONE, NULL, false); - if (ret <= 0) { - /* - * We succeeded, return 1 so the caller knows we're done - * with this page and already handled the IO. - * - * If there was an error then cow_file_range_inline() has - * already done the cleanup. 
- */ - if (ret == 0) - ret = 1; - goto done; - } - } - alloc_hint = btrfs_get_extent_allocation_hint(inode, start, num_bytes); /* @@ -1581,7 +1472,6 @@ static noinline int cow_file_range(struct btrfs_inode *inode, } extent_clear_unlock_delalloc(inode, orig_start, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC, page_ops); -done: if (done_offset) *done_offset = end; return ret; @@ -1701,7 +1591,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, struct async_cow *ctx; struct async_chunk *async_chunk; unsigned long nr_pages; - u64 num_chunks = DIV_ROUND_UP(end - start, SZ_512K); + u64 num_chunks = DIV_ROUND_UP(end - start, BTRFS_COMPRESSION_CHUNK_SIZE); int i; unsigned nofs_flag; const blk_opf_t write_flags = wbc_to_write_flags(wbc); @@ -1718,7 +1608,7 @@ static bool run_delalloc_compressed(struct btrfs_inode *inode, atomic_set(&ctx->num_chunks, num_chunks); for (i = 0; i < num_chunks; i++) { - u64 cur_end = min(end, start + SZ_512K - 1); + u64 cur_end = min(end, start + BTRFS_COMPRESSION_CHUNK_SIZE - 1); /* * igrab is called higher up in the call chain, take only the @@ -1853,7 +1743,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, */ btrfs_lock_extent(io_tree, start, end, &cached_state); count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + EXTENT_NORESERVE, false, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1884,7 +1774,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, * a locked folio, which can race with writeback. 
*/ ret = cow_file_range(inode, locked_folio, start, end, NULL, - COW_FILE_RANGE_NO_INLINE | COW_FILE_RANGE_KEEP_LOCKED); + COW_FILE_RANGE_KEEP_LOCKED); ASSERT(ret != 1); return ret; } @@ -1936,6 +1826,11 @@ static int can_nocow_file_extent(struct btrfs_path *path, int ret = 0; bool nowait = path->nowait; + /* If there are pending snapshots for this root, we must do COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); @@ -1997,11 +1892,6 @@ static int can_nocow_file_extent(struct btrfs_path *path, path = NULL; } - /* If there are pending snapshots for this root, we must COW. */ - if (args->writeback_path && !is_freespace_inode && - atomic_read(&root->snapshot_force_cow)) - goto out; - args->file_extent.num_bytes = min(args->end + 1, extent_end) - args->start; args->file_extent.offset += args->start - key->offset; io_start = args->file_extent.disk_bytenr + args->file_extent.offset; @@ -2435,6 +2325,91 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) return false; } +/* + * Return 0 if an inlined extent is created successfully. + * Return <0 if critical error happened. + * Return >0 if an inline extent can not be created. 
+ */ +static int run_delalloc_inline(struct btrfs_inode *inode, struct folio *locked_folio) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct compressed_bio *cb = NULL; + struct extent_state *cached = NULL; + const u64 i_size = i_size_read(&inode->vfs_inode); + const u32 blocksize = fs_info->sectorsize; + int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; + u32 compressed_size = 0; + int ret; + + ASSERT(folio_pos(locked_folio) == 0); + + if (btrfs_inode_can_compress(inode) && + inode_need_compress(inode, 0, blocksize, true)) { + if (inode->defrag_compress > 0 && + inode->defrag_compress < BTRFS_NR_COMPRESS_TYPES) { + compress_type = inode->defrag_compress; + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { + compress_type = inode->prop_compress; + } + cb = btrfs_compress_bio(inode, 0, blocksize, compress_type, compress_level, 0); + if (IS_ERR(cb)) { + cb = NULL; + /* Just fall back to non-compressed case. */ + } else { + compressed_size = cb->bbio.bio.bi_iter.bi_size; + } + } + if (!can_cow_file_range_inline(inode, 0, i_size, compressed_size)) { + if (cb) + cleanup_compressed_bio(cb); + return 1; + } + + btrfs_lock_extent(&inode->io_tree, 0, blocksize - 1, &cached); + if (cb) { + ret = __cow_file_range_inline(inode, i_size, compressed_size, compress_type, + bio_first_folio_all(&cb->bbio.bio), false); + cleanup_compressed_bio(cb); + cb = NULL; + } else { + ret = __cow_file_range_inline(inode, i_size, 0, BTRFS_COMPRESS_NONE, + NULL, false); + } + /* + * We failed to insert inline extent due to lack of meta space. + * Just unlock the extent io range and fallback to regular COW/NOCOW path. + */ + if (ret > 0) { + btrfs_unlock_extent(&inode->io_tree, 0, blocksize - 1, &cached); + return ret; + } + + /* + * In the successful case (ret == 0 here), btrfs_run_delalloc_range() + * will return 1. 
+ * + * Quite a bit further up the callstack in extent_writepage(), ret == 1 + * is treated as a short circuited success and does not unlock the folio, + * so we must do it here. + * + * For failure case, the @locked_folio does get unlocked by + * btrfs_folio_end_lock_bitmap(), so we must *not* unlock it here. + * + * So if ret == 0, we let extent_clear_unlock_delalloc() to unlock the + * folio by passing NULL as @locked_folio. + * Otherwise pass @locked_folio as usual. + */ + if (ret == 0) + locked_folio = NULL; + extent_clear_unlock_delalloc(inode, 0, blocksize - 1, locked_folio, &cached, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | + EXTENT_DO_ACCOUNTING | EXTENT_LOCKED, + PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK); + return ret; +} + /* * Function to process delayed allocation (create CoW) for ranges which are * being touched for the first time. @@ -2451,11 +2426,26 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol ASSERT(!(end <= folio_pos(locked_folio) || start >= folio_next_pos(locked_folio))); + if (start == 0 && end + 1 <= inode->root->fs_info->sectorsize && + end + 1 >= inode->disk_i_size) { + int ret; + + ret = run_delalloc_inline(inode, locked_folio); + if (ret < 0) + return ret; + if (ret == 0) + return 1; + /* + * Continue regular handling if we can not create an + * inlined extent. + */ + } + if (should_nocow(inode, start, end)) return run_delalloc_nocow(inode, locked_folio, start, end); if (btrfs_inode_can_compress(inode) && - inode_need_compress(inode, start, end) && + inode_need_compress(inode, start, end, false) && run_delalloc_compressed(inode, locked_folio, start, end, wbc)) return 1; @@ -2745,17 +2735,19 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, } /* - * given a list of ordered sums record them in the inode. This happens - * at IO completion time based on sums calculated at bio submission time. 
+ * Given an ordered extent and insert all its checksums into the csum tree. + * + * This happens at IO completion time based on sums calculated at bio + * submission time. */ static int add_pending_csums(struct btrfs_trans_handle *trans, - struct list_head *list) + struct btrfs_ordered_extent *oe) { struct btrfs_ordered_sum *sum; struct btrfs_root *csum_root = NULL; int ret; - list_for_each_entry(sum, list, list) { + list_for_each_entry(sum, &oe->csum_list, list) { if (!csum_root) { csum_root = btrfs_csum_root(trans->fs_info, sum->logical); @@ -2767,7 +2759,7 @@ static int add_pending_csums(struct btrfs_trans_handle *trans, } } trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, csum_root, sum); + ret = btrfs_insert_data_csums(trans, csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2956,7 +2948,9 @@ out_page: * to reflect the errors and clean the page. */ mapping_set_error(folio->mapping, ret); - btrfs_mark_ordered_io_finished(inode, folio, page_start, + btrfs_folio_clear_ordered(fs_info, folio, page_start, + folio_size(folio)); + btrfs_mark_ordered_io_finished(inode, page_start, folio_size(folio), !ret); folio_clear_dirty_for_io(folio); } @@ -3203,7 +3197,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) bool freespace_inode; bool truncated = false; bool clear_reserved_extent = true; - unsigned int clear_bits = EXTENT_DEFRAG; + unsigned int clear_bits = 0; start = ordered_extent->file_offset; end = start + ordered_extent->num_bytes - 1; @@ -3214,6 +3208,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) !test_bit(BTRFS_ORDERED_ENCODED, &ordered_extent->flags)) clear_bits |= EXTENT_DELALLOC_NEW; + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) + clear_bits |= EXTENT_DEFRAG; + freespace_inode = btrfs_is_free_space_inode(inode); if (!freespace_inode) btrfs_lockdep_acquire(fs_info, btrfs_ordered_extent); @@ -3271,8 +3268,8 @@ int btrfs_finish_one_ordered(struct 
btrfs_ordered_extent *ordered_extent) if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { /* Logic error */ - ASSERT(list_empty(&ordered_extent->list)); - if (unlikely(!list_empty(&ordered_extent->list))) { + ASSERT(list_empty(&ordered_extent->csum_list)); + if (unlikely(!list_empty(&ordered_extent->csum_list))) { ret = -EINVAL; btrfs_abort_transaction(trans, ret); goto out; @@ -3321,7 +3318,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, &ordered_extent->list); + ret = add_pending_csums(trans, ordered_extent); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out; @@ -3345,8 +3342,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + if (clear_bits) + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); @@ -3427,7 +3425,7 @@ out: * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -4697,7 +4695,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) dir_id = btrfs_super_root_dir(fs_info->super_copy); di = btrfs_lookup_dir_item(NULL, fs_info->tree_root, path, dir_id, &name, 0); - if (di && !IS_ERR(di)) { + if (!IS_ERR_OR_NULL(di)) { btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key); if (key.objectid == btrfs_root_id(root)) { ret = -EPERM; @@ -5448,7 +5446,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * zero. Make sure any new writes to the file get on disk * on close. 
*/ - if (newsize == 0) + if (newsize == 0 && oldsize != 0) set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); @@ -6859,7 +6857,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, } } else { ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 0, BTRFS_I(inode)->dir_index); + false, BTRFS_I(inode)->dir_index); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto discard; @@ -7075,7 +7073,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, inode_set_ctime_current(inode); ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &fname.disk_name, 1, index); + &fname.disk_name, true, index); if (ret) goto fail; @@ -8173,7 +8171,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) if (!freespace_inode) btrfs_lockdep_acquire(root->fs_info, btrfs_ordered_extent); - btrfs_remove_ordered_extent(inode, ordered); + btrfs_remove_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -8495,14 +8493,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - new_name, 0, old_idx); + new_name, false, old_idx); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; } ret = btrfs_add_link(trans, BTRFS_I(old_dir), BTRFS_I(new_inode), - old_name, 0, new_idx); + old_name, false, new_idx); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -8793,7 +8791,7 @@ static int btrfs_rename(struct mnt_idmap *idmap, } ret = btrfs_add_link(trans, BTRFS_I(new_dir), BTRFS_I(old_inode), - &new_fname.disk_name, 0, index); + &new_fname.disk_name, false, index); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto out_fail; @@ -8978,7 +8976,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_conte { struct btrfs_fs_info *fs_info = root->fs_info; - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) return 
-EROFS; return start_delalloc_inodes(root, NULL, true, in_reclaim_context); } @@ -8991,7 +8989,7 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, LIST_HEAD(splice); int ret; - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) return -EROFS; mutex_lock(&fs_info->delalloc_root_mutex); @@ -9986,7 +9984,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, size_t bytes = min(min_folio_size, iov_iter_count(from)); char *kaddr; - folio = btrfs_alloc_compr_folio(fs_info); + folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (!folio) { ret = -ENOMEM; goto out_cb; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index d75d31b606e4..b2e447f5005c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2897,7 +2897,7 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, return -ENOMEM; space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_KERNEL); + dest = kzalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; @@ -2953,7 +2953,8 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, user_dest = (struct btrfs_ioctl_space_info __user *) (arg + sizeof(struct btrfs_ioctl_space_args)); - if (copy_to_user(user_dest, dest_orig, alloc_size)) + if (copy_to_user(user_dest, dest_orig, + space_args.total_spaces * sizeof(*dest_orig))) return -EFAULT; out: @@ -3038,7 +3039,7 @@ static long btrfs_ioctl_scrub(struct file *file, void __user *arg) ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end, &sa->progress, sa->flags & BTRFS_SCRUB_READONLY, - 0); + false); /* * Copy scrub args to user space even if btrfs_scrub_dev() returned an @@ -3928,7 +3929,7 @@ static long _btrfs_ioctl_set_received_subvol(struct file *file, ret = btrfs_uuid_tree_add(trans, sa->uuid, BTRFS_UUID_KEY_RECEIVED_SUBVOL, btrfs_root_id(root)); - if (unlikely(ret < 0 && ret != -EEXIST)) { + if (unlikely(ret < 0)) { btrfs_abort_transaction(trans, ret); btrfs_end_transaction(trans); 
goto out; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 79642e02181b..2de18c7b563a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -106,22 +106,6 @@ fail: return ERR_PTR(-ENOMEM); } -static inline void write_compress_length(char *buf, size_t len) -{ - __le32 dlen; - - dlen = cpu_to_le32(len); - memcpy(buf, &dlen, LZO_LEN); -} - -static inline size_t read_compress_length(const char *buf) -{ - __le32 dlen; - - memcpy(&dlen, buf, LZO_LEN); - return le32_to_cpu(dlen); -} - /* * Write data into @out_folio and queue it into @out_bio. * @@ -218,14 +202,14 @@ static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, ASSERT((old_size >> sectorsize_bits) == (old_size + LZO_LEN - 1) >> sectorsize_bits); if (!*out_folio) { - *out_folio = btrfs_alloc_compr_folio(fs_info); + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (!*out_folio) return -ENOMEM; } /* Write the segment header first. */ kaddr = kmap_local_folio(*out_folio, offset_in_folio(*out_folio, *total_out)); - write_compress_length(kaddr, compressed_size); + put_unaligned_le32(compressed_size, kaddr); kunmap_local(kaddr); ret = write_and_queue_folio(out_bio, out_folio, total_out, LZO_LEN); if (ret < 0) @@ -245,7 +229,7 @@ static int copy_compressed_data_to_bio(struct btrfs_fs_info *fs_info, return -E2BIG; if (!*out_folio) { - *out_folio = btrfs_alloc_compr_folio(fs_info); + *out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (!*out_folio) return -ENOMEM; } @@ -296,7 +280,7 @@ int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) ASSERT(bio->bi_iter.bi_size == 0); ASSERT(len); - folio_out = btrfs_alloc_compr_folio(fs_info); + folio_out = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (!folio_out) return -ENOMEM; @@ -362,7 +346,7 @@ int lzo_compress_bio(struct list_head *ws, struct compressed_bio *cb) /* Store the size of all chunks of compressed data */ sizes_ptr = kmap_local_folio(bio_first_folio_all(bio), 0); - write_compress_length(sizes_ptr, total_out); + 
put_unaligned_le32(total_out, sizes_ptr); kunmap_local(sizes_ptr); out: /* @@ -431,6 +415,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct workspace *workspace = list_entry(ws, struct workspace, list); struct btrfs_fs_info *fs_info = cb->bbio.inode->root->fs_info; const u32 sectorsize = fs_info->sectorsize; + const u32 compressed_len = bio_get_size(&cb->bbio.bio); struct folio_iter fi; char *kaddr; int ret; @@ -449,7 +434,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) return -EINVAL; ASSERT(folio_size(fi.folio) == btrfs_min_folio_size(fs_info)); kaddr = kmap_local_folio(fi.folio, 0); - len_in = read_compress_length(kaddr); + len_in = get_unaligned_le32(kaddr); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -460,14 +445,14 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) * and all sectors should be used. * If this happens, it means the compressed extent is corrupted. */ - if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, cb->compressed_len) || - round_up(len_in, sectorsize) < cb->compressed_len)) { + if (unlikely(len_in > min_t(size_t, BTRFS_MAX_COMPRESSED, compressed_len) || + round_up(len_in, sectorsize) < compressed_len)) { struct btrfs_inode *inode = cb->bbio.inode; btrfs_err(fs_info, "lzo header invalid, root %llu inode %llu offset %llu lzo len %u compressed len %u", btrfs_root_id(inode->root), btrfs_ino(inode), - cb->start, len_in, cb->compressed_len); + cb->start, len_in, compressed_len); return -EUCLEAN; } @@ -488,7 +473,7 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) cur_folio = get_current_folio(cb, &fi, &cur_folio_index, cur_in); ASSERT(cur_folio); kaddr = kmap_local_folio(cur_folio, 0); - seg_len = read_compress_length(kaddr + offset_in_folio(cur_folio, cur_in)); + seg_len = get_unaligned_le32(kaddr + offset_in_folio(cur_folio, cur_in)); kunmap_local(kaddr); cur_in += LZO_LEN; @@ -559,12 +544,12 @@ int lzo_decompress(struct 
list_head *ws, const u8 *data_in, if (unlikely(srclen < LZO_LEN || srclen > max_segment_len + LZO_LEN * 2)) return -EUCLEAN; - in_len = read_compress_length(data_in); + in_len = get_unaligned_le32(data_in); if (unlikely(in_len != srclen)) return -EUCLEAN; data_in += LZO_LEN; - in_len = read_compress_length(data_in); + in_len = get_unaligned_le32(data_in); if (unlikely(in_len != srclen - LZO_LEN * 2)) return -EUCLEAN; data_in += LZO_LEN; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index 6190777924bf..7c60c14e60fa 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -37,7 +37,7 @@ static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf) memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE)); curr += sizeof(STATE_STRING_PREFACE) - 1; - if (BTRFS_FS_ERROR(info)) { + if (unlikely(BTRFS_FS_ERROR(info))) { *curr++ = 'E'; states_printed = true; } diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index c8e92efce405..556d4e79cde6 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -144,11 +144,11 @@ do { \ verify_assert_printk_format("check the format string" args); \ if (!likely(cond)) { \ if (("" __FIRST_ARG(args) [0]) == 0) { \ - pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ - #cond, (long)(cond), __FILE__, __LINE__); \ + pr_err("assertion failed: %s, in %s:%d\n", \ + #cond, __FILE__, __LINE__); \ } else { \ - pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ - #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ + pr_err("assertion failed: %s, in %s:%d (" __FIRST_ARG(args) ")\n", \ + #cond, __FILE__, __LINE__ __REST_ARGS(args)); \ } \ BUG(); \ } \ diff --git a/fs/btrfs/misc.h b/fs/btrfs/misc.h index 12c5a9d6564f..694be6d0562a 100644 --- a/fs/btrfs/misc.h +++ b/fs/btrfs/misc.h @@ -28,7 +28,8 @@ name = (1U << __ ## name ## _BIT), \ __ ## name ## _SEQ = __ ## name ## _BIT -static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) +static inline 
phys_addr_t bio_iter_phys(const struct bio *bio, + const struct bvec_iter *iter) { struct bio_vec bv = bio_iter_iovec(bio, *iter); @@ -52,15 +53,22 @@ static inline phys_addr_t bio_iter_phys(struct bio *bio, struct bvec_iter *iter) (paddr = bio_iter_phys((bio), (iter)), 1); \ bio_advance_iter_single((bio), (iter), (blocksize))) -/* Initialize a bvec_iter to the size of the specified bio. */ -static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +/* Can only be called on a non-cloned bio. */ +static inline u32 bio_get_size(struct bio *bio) { struct bio_vec *bvec; - u32 bio_size = 0; + u32 ret = 0; int i; bio_for_each_bvec_all(bvec, bio, i) - bio_size += bvec->bv_len; + ret += bvec->bv_len; + return ret; +} + +/* Initialize a bvec_iter to the size of the specified bio. */ +static inline struct bvec_iter init_bvec_iter_for_bio(struct bio *bio) +{ + const u32 bio_size = bio_get_size(bio); return (struct bvec_iter) { .bi_sector = 0, diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5df02c707aee..e5a24b3ff95e 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -156,6 +156,19 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( const bool is_nocow = (flags & ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); + /* Only one type flag can be set. */ + ASSERT(has_single_bit_set(flags & BTRFS_ORDERED_EXCLUSIVE_FLAGS)); + + /* DIRECT cannot be set with COMPRESSED nor ENCODED. */ + if (test_bit(BTRFS_ORDERED_DIRECT, &flags)) { + ASSERT(!test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); + ASSERT(!test_bit(BTRFS_ORDERED_ENCODED, &flags)); + } + + /* ENCODED must be set with COMPRESSED. */ + if (test_bit(BTRFS_ORDERED_ENCODED, &flags)) + ASSERT(test_bit(BTRFS_ORDERED_COMPRESSED, &flags)); + /* * For a NOCOW write we can free the qgroup reserve right now. 
For a COW * one we transfer the reserved space from the inode's iotree into the @@ -197,7 +210,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->flags = flags; refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); - INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->csum_list); INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); @@ -240,10 +253,15 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) spin_lock(&inode->ordered_tree_lock); node = tree_insert(&inode->ordered_tree, entry->file_offset, &entry->rb_node); - if (unlikely(node)) + if (unlikely(node)) { + struct btrfs_ordered_extent *exist = + rb_entry(node, struct btrfs_ordered_extent, rb_node); + btrfs_panic(fs_info, -EEXIST, - "inconsistency in ordered tree at offset %llu", - entry->file_offset); +"overlapping ordered extents, existing oe file_offset %llu num_bytes %llu flags 0x%lx, new oe file_offset %llu num_bytes %llu flags 0x%lx", + exist->file_offset, exist->num_bytes, exist->flags, + entry->file_offset, entry->num_bytes, entry->flags); + } spin_unlock(&inode->ordered_tree_lock); spin_lock(&root->ordered_extent_lock); @@ -329,7 +347,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_inode *inode = entry->inode; spin_lock(&inode->ordered_tree_lock); - list_add_tail(&sum->list, &entry->list); + list_add_tail(&sum->list, &entry->csum_list); spin_unlock(&inode->ordered_tree_lock); } @@ -348,30 +366,13 @@ static void finish_ordered_fn(struct btrfs_work *work) } static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct folio *folio, u64 file_offset, - u64 len, bool uptodate) + u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; lockdep_assert_held(&inode->ordered_tree_lock); - if (folio) { - ASSERT(folio->mapping); - ASSERT(folio_pos(folio) <= 
file_offset); - ASSERT(file_offset + len <= folio_next_pos(folio)); - - /* - * Ordered flag indicates whether we still have - * pending io unfinished for the ordered extent. - * - * If it's not set, we need to skip to next range. - */ - if (!btrfs_folio_test_ordered(fs_info, folio, file_offset, len)) - return false; - btrfs_folio_clear_ordered(fs_info, folio, file_offset, len); - } - /* Now we're fine to update the accounting. */ if (WARN_ON_ONCE(len > ordered->bytes_left)) { btrfs_crit(fs_info, @@ -385,7 +386,7 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, } if (!uptodate) - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_mark_ordered_extent_error(ordered); if (ordered->bytes_left) return false; @@ -413,8 +414,7 @@ static void btrfs_queue_ordered_fn(struct btrfs_ordered_extent *ordered) } void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct folio *folio, u64 file_offset, u64 len, - bool uptodate) + u64 file_offset, u64 len, bool uptodate) { struct btrfs_inode *inode = ordered->inode; bool ret; @@ -422,7 +422,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, trace_btrfs_finish_ordered_extent(inode, file_offset, len, uptodate); spin_lock(&inode->ordered_tree_lock); - ret = can_finish_ordered_extent(ordered, folio, file_offset, len, + ret = can_finish_ordered_extent(ordered, file_offset, len, uptodate); spin_unlock(&inode->ordered_tree_lock); @@ -475,8 +475,7 @@ void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * extent(s) covering it. 
*/ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct folio *folio, u64 file_offset, - u64 num_bytes, bool uptodate) + u64 file_offset, u64 num_bytes, bool uptodate) { struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; @@ -536,7 +535,7 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, len = this_end - cur; ASSERT(len < U32_MAX); - if (can_finish_ordered_extent(entry, folio, cur, len, uptodate)) { + if (can_finish_ordered_extent(entry, cur, len, uptodate)) { spin_unlock(&inode->ordered_tree_lock); btrfs_queue_ordered_fn(entry); spin_lock(&inode->ordered_tree_lock); @@ -628,7 +627,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); btrfs_add_delayed_iput(entry->inode); - list_for_each_entry_safe(sum, tmp, &entry->list, list) + list_for_each_entry_safe(sum, tmp, &entry->csum_list, list) kvfree(sum); kmem_cache_free(btrfs_ordered_extent_cache, entry); } @@ -638,9 +637,9 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. No references are dropped * and waiters are woken up. 
*/ -void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, - struct btrfs_ordered_extent *entry) +void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry) { + struct btrfs_inode *btrfs_inode = entry->inode; struct btrfs_root *root = btrfs_inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; @@ -1323,10 +1322,10 @@ struct btrfs_ordered_extent *btrfs_split_ordered_extent( } } - list_for_each_entry_safe(sum, tmpsum, &ordered->list, list) { + list_for_each_entry_safe(sum, tmpsum, &ordered->csum_list, list) { if (offset == len) break; - list_move_tail(&sum->list, &new->list); + list_move_tail(&sum->list, &new->csum_list); offset += sum->len; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 1e6b0b182b29..03e12380a2fd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -47,8 +47,25 @@ struct btrfs_ordered_sum { * IO is done and any metadata is inserted into the tree. */ enum { + /* Extra status bits for ordered extents */ + + /* Set when all the pages are written. */ + BTRFS_ORDERED_IO_DONE, + /* Set when removed from the tree. */ + BTRFS_ORDERED_COMPLETE, + /* We had an io error when writing this out. */ + BTRFS_ORDERED_IOERR, + /* Set when we have to truncate an extent. */ + BTRFS_ORDERED_TRUNCATED, + /* Used during fsync to track already logged extents. */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent. */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction. */ + BTRFS_ORDERED_PENDING, + /* - * Different types for ordered extents, one and only one of the 4 types + * Different types for ordered extents, one and only one of these types * need to be set when creating ordered extent. * * REGULAR: For regular non-compressed COW write @@ -61,37 +78,27 @@ enum { BTRFS_ORDERED_PREALLOC, BTRFS_ORDERED_COMPRESSED, + /* Extra bit for encoded write, must be set with COMPRESSED. 
*/ + BTRFS_ORDERED_ENCODED, + /* * Extra bit for direct io, can only be set for - * REGULAR/NOCOW/PREALLOC. No direct io for compressed extent. + * REGULAR/NOCOW/PREALLOC. Must not be set for COMPRESSED nor ENCODED. */ BTRFS_ORDERED_DIRECT, - /* Extra status bits for ordered extents */ - - /* set when all the pages are written */ - BTRFS_ORDERED_IO_DONE, - /* set when removed from the tree */ - BTRFS_ORDERED_COMPLETE, - /* We had an io error when writing this out */ - BTRFS_ORDERED_IOERR, - /* Set when we have to truncate an extent */ - BTRFS_ORDERED_TRUNCATED, - /* Used during fsync to track already logged extents */ - BTRFS_ORDERED_LOGGED, - /* We have already logged all the csums of the ordered extent */ - BTRFS_ORDERED_LOGGED_CSUM, - /* We wait for this extent to complete in the current transaction */ - BTRFS_ORDERED_PENDING, - /* BTRFS_IOC_ENCODED_WRITE */ - BTRFS_ORDERED_ENCODED, + BTRFS_ORDERED_NR_FLAGS, }; +static_assert(BTRFS_ORDERED_NR_FLAGS <= BITS_PER_LONG); + +/* One and only one flag can be set. */ +#define BTRFS_ORDERED_EXCLUSIVE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ + (1UL << BTRFS_ORDERED_NOCOW) | \ + (1UL << BTRFS_ORDERED_PREALLOC) | \ + (1UL << BTRFS_ORDERED_COMPRESSED)) /* BTRFS_ORDERED_* flags that specify the type of the extent. 
*/ -#define BTRFS_ORDERED_TYPE_FLAGS ((1UL << BTRFS_ORDERED_REGULAR) | \ - (1UL << BTRFS_ORDERED_NOCOW) | \ - (1UL << BTRFS_ORDERED_PREALLOC) | \ - (1UL << BTRFS_ORDERED_COMPRESSED) | \ +#define BTRFS_ORDERED_TYPE_FLAGS (BTRFS_ORDERED_EXCLUSIVE_FLAGS | \ (1UL << BTRFS_ORDERED_DIRECT) | \ (1UL << BTRFS_ORDERED_ENCODED)) @@ -134,7 +141,7 @@ struct btrfs_ordered_extent { struct btrfs_inode *inode; /* list of checksums for insertion when the extent io is done */ - struct list_head list; + struct list_head csum_list; /* used for fast fsyncs */ struct list_head log_list; @@ -161,14 +168,11 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent); int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, - struct btrfs_ordered_extent *entry); +void btrfs_remove_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_finish_ordered_extent(struct btrfs_ordered_extent *ordered, - struct folio *folio, u64 file_offset, u64 len, - bool uptodate); + u64 file_offset, u64 len, bool uptodate); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct folio *folio, u64 file_offset, - u64 num_bytes, bool uptodate); + u64 file_offset, u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index b7dfe877cf8d..87e60a2d4bd8 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -626,10 +626,6 @@ void btrfs_print_tree(const struct extent_buffer *c, bool follow) next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check); if (IS_ERR(next)) continue; - if (!extent_buffer_uptodate(next)) { - free_extent_buffer(next); - continue; - } if (btrfs_is_leaf(next) && level != 1) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c 
index 41589ce66371..cdf736d3a4e5 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2740,8 +2740,6 @@ static void qgroup_iterator_nested_clean(struct list_head *head) } } -#define UPDATE_NEW 0 -#define UPDATE_OLD 1 /* * Walk all of the roots that points to the bytenr and adjust their refcnts. */ @@ -2980,10 +2978,10 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, seq = fs_info->qgroup_seq; /* Update old refcnts using old_roots */ - qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, UPDATE_OLD); + qgroup_update_refcnt(fs_info, old_roots, &qgroups, seq, true); /* Update new refcnts using new_roots */ - qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, UPDATE_NEW); + qgroup_update_refcnt(fs_info, new_roots, &qgroups, seq, false); qgroup_update_counters(fs_info, &qgroups, nr_old_roots, nr_new_roots, num_bytes, seq); @@ -4326,7 +4324,7 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, u64 freed = 0; int ret; - extent_changeset_init(&changeset); + extent_changeset_init_bytes_only(&changeset); len = round_up(start + len, root->fs_info->sectorsize); start = round_down(start, root->fs_info->sectorsize); @@ -4391,7 +4389,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, WARN_ON(!free && reserved); if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); - extent_changeset_init(&changeset); + extent_changeset_init_bytes_only(&changeset); ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) @@ -4491,8 +4489,8 @@ static int sub_root_meta_rsv(struct btrfs_root *root, int num_bytes, return num_bytes; } -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) +static int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, + enum btrfs_qgroup_rsv_type type, bool enforce) { struct btrfs_fs_info *fs_info = 
root->fs_info; int ret; @@ -4518,20 +4516,21 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return ret; } -int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce, - bool noflush) +int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, + bool enforce, bool noflush) { int ret; - ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); + ret = btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); if (ret < 0) return ret; - return btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); + return btrfs_qgroup_reserve_meta(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC, enforce); } /* @@ -4553,8 +4552,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) BTRFS_QGROUP_RSV_META_PERTRANS); } -void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type) +void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -4567,10 +4565,13 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, * which can lead to underflow. * Here ensure we will only free what we really have reserved. 
*/ - num_bytes = sub_root_meta_rsv(root, num_bytes, type); + num_bytes = sub_root_meta_rsv(root, num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); - btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); + btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, + BTRFS_QGROUP_RSV_META_PREALLOC); } static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, @@ -4646,6 +4647,7 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { + ASSERT(extent_changeset_tracks_ranges(&changeset)); ULIST_ITER_INIT(&iter); while ((unode = ulist_next(&changeset.range_changed, &iter))) { btrfs_warn(inode->root->fs_info, @@ -4883,10 +4885,6 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, reloc_eb = NULL; goto free_out; } - if (unlikely(!extent_buffer_uptodate(reloc_eb))) { - ret = -EIO; - goto free_out; - } ret = qgroup_trace_subtree_swap(trans, reloc_eb, subvol_eb, block->last_snapshot, block->trace_leaf); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index a979fd59a4da..80dd2dacd56d 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -392,46 +392,10 @@ int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, u64 *freed); -int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce); -int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce, - bool noflush); -/* Reserve metadata space for pertrans and prealloc type */ -static inline int 
btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, - int num_bytes, bool enforce) -{ - return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS, - enforce, false); -} -static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, - int num_bytes, bool enforce, - bool noflush) -{ - return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, - enforce, noflush); -} - -void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type); - -/* Free per-transaction meta reservation for error handling */ -static inline void btrfs_qgroup_free_meta_pertrans(struct btrfs_root *root, - int num_bytes) -{ - __btrfs_qgroup_free_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS); -} - +int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, int num_bytes, + bool enforce, bool noflush); /* Pre-allocated meta reservation can be freed at need */ -static inline void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, - int num_bytes) -{ - __btrfs_qgroup_free_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC); -} +void btrfs_qgroup_free_meta_prealloc(struct btrfs_root *root, int num_bytes); void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root); void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 2987cb7c686e..638c4ad572c9 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -300,7 +300,7 @@ int btrfs_insert_one_raid_extent(struct btrfs_trans_handle *trans, int ret; stripe_extent = kzalloc(item_size, GFP_NOFS); - if (!unlikely(stripe_extent)) { + if (unlikely(!stripe_extent)) { btrfs_abort_transaction(trans, -ENOMEM); btrfs_end_transaction(trans); return -ENOMEM; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 02105d68accb..e31d57d6ab1e 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ 
-1653,12 +1653,7 @@ static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { int total_sector_nr = get_bio_sector_nr(rbio, bio); - u32 bio_size = 0; - struct bio_vec *bvec; - int i; - - bio_for_each_bvec_all(bvec, bio, i) - bio_size += bvec->bv_len; + const u32 bio_size = bio_get_size(bio); /* * Since we can have multiple bios touching the error_bitmap, we cannot @@ -1666,7 +1661,7 @@ static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bi * * Instead use set_bit() for each bit, as set_bit() itself is atomic. */ - for (i = total_sector_nr; i < total_sector_nr + + for (int i = total_sector_nr; i < total_sector_nr + (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++) set_bit(i, rbio->error_bitmap); } @@ -2110,8 +2105,8 @@ static int recover_sectors(struct btrfs_raid_bio *rbio) * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. */ - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); + unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; @@ -2844,8 +2839,8 @@ static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) * @unmap_array stores copy of pointers that does not get reordered * during reconstruction so that kunmap_local works. 
*/ - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + pointers = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); + unmap_array = kzalloc_objs(void *, rbio->real_stripes, GFP_NOFS); if (!pointers || !unmap_array) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 314cb95ba846..49865a463780 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -322,6 +322,51 @@ copy_to_page: ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); + + /* + * If we copied the inline extent data to a page/folio beyond the i_size + * of the destination inode, then we need to increase the i_size before + * we start a transaction to update the inode item. This is to prevent a + * deadlock when the flushoncommit mount option is used, which happens + * like this: + * + * 1) Task A clones an inline extent from inode X to an offset of inode + * Y that is beyond Y's current i_size. This means we copied the + * inline extent's data to a folio of inode Y that is beyond its EOF, + * using the call above to copy_inline_to_page(); + * + * 2) Task B starts a transaction commit and calls + * btrfs_start_delalloc_flush() to flush delalloc; + * + * 3) The delalloc flushing sees the new dirty folio of inode Y and when + * it attempts to flush it, it ends up at extent_writepage() and sees + * that the offset of the folio is beyond the i_size of inode Y, so + * it attempts to invalidate the folio by calling folio_invalidate(), + * which ends up at btrfs' folio invalidate callback - + * btrfs_invalidate_folio(). 
There it tries to lock the folio's range + * in inode Y's extent io tree, but it blocks since it's currently + * locked by task A - during reflink we lock the inodes and the + * source and destination ranges after flushing all delalloc and + * waiting for ordered extent completion - after that we don't expect + * to have dirty folios in the ranges, the exception is if we have to + * copy an inline extent's data (because the destination offset is + * not zero); + * + * 4) Task A then does the 'goto out' below and attempts to start a + * transaction to update the inode item, and then it's blocked since + * the current transaction is in the TRANS_STATE_COMMIT_START state. + * Therefore task A has to wait for the current transaction to become + * unblocked (its state >= TRANS_STATE_UNBLOCKED). + * + * This leads to a deadlock - the task committing the transaction + * waiting for the delalloc flushing which is blocked during folio + * invalidation on the inode's extent lock and the reflink task waiting + * for the current transaction to be unblocked so that it can start a + * new one to update the inode item (while holding the extent lock). 
+ */ + if (ret == 0 && new_key->offset + datal > i_size_read(&inode->vfs_inode)) + i_size_write(&inode->vfs_inode, new_key->offset + datal); + goto out; } @@ -646,7 +691,7 @@ static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, */ btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, - ALIGN(len, bs), dst_loff, 1); + ALIGN(len, bs), dst_loff, true); btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -747,7 +792,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, */ end = destoff + len - 1; btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); - ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); + ret = btrfs_clone(src, inode, off, olen, len, destoff, false); btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); if (ret < 0) return ret; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 033f74fd6225..1c42c5180bdd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2440,10 +2440,7 @@ static int get_tree_block_key(struct btrfs_fs_info *fs_info, eb = read_tree_block(fs_info, block->bytenr, &check); if (IS_ERR(eb)) return PTR_ERR(eb); - if (unlikely(!extent_buffer_uptodate(eb))) { - free_extent_buffer(eb); - return -EIO; - } + if (block->level == 0) btrfs_item_key_to_cpu(eb, &block->key, 0); else @@ -3645,12 +3642,7 @@ restart: btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL); /* get rid of pinned extents */ - trans = btrfs_join_transaction(rc->extent_root); - if (IS_ERR(trans)) { - err = PTR_ERR(trans); - goto out_free; - } - ret = btrfs_commit_transaction(trans); + ret = btrfs_commit_current_transaction(rc->extent_root); if (ret && !err) err = ret; out_free: diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index bc94bbc00772..1ac609239cbe 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ 
-891,16 +891,11 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); - u32 bio_size = 0; - int i; + const u32 bio_size = bio_get_size(&bbio->bio); ASSERT(sector_nr < stripe->nr_sectors); - bio_for_each_bvec_all(bvec, &bbio->bio, i) - bio_size += bvec->bv_len; - if (bbio->bio.bi_status) { scrub_bitmap_set_io_error(stripe, sector_nr, bio_size >> fs_info->sectorsize_bits); @@ -1249,15 +1244,11 @@ out: static void scrub_read_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; - struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); int num_sectors; - u32 bio_size = 0; - int i; + const u32 bio_size = bio_get_size(&bbio->bio); ASSERT(sector_nr < stripe->nr_sectors); - bio_for_each_bvec_all(bvec, &bbio->bio, i) - bio_size += bvec->bv_len; num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { @@ -1278,13 +1269,8 @@ static void scrub_write_endio(struct btrfs_bio *bbio) { struct scrub_stripe *stripe = bbio->private; struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - struct bio_vec *bvec; int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); - u32 bio_size = 0; - int i; - - bio_for_each_bvec_all(bvec, &bbio->bio, i) - bio_size += bvec->bv_len; + const u32 bio_size = bio_get_size(&bbio->bio); if (bbio->bio.bi_status) { unsigned long flags; @@ -1293,7 +1279,7 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); - for (i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) btrfs_dev_stat_inc_and_print(stripe->dev, 
BTRFS_DEV_STAT_WRITE_ERRS); } @@ -2988,7 +2974,7 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, struct page *page; struct btrfs_fs_info *fs_info = sctx->fs_info; - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) return -EROFS; page = alloc_page(GFP_KERNEL); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 904a2f57f86d..89d72d8cb85f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7201,7 +7201,7 @@ static int changed_cb(struct btrfs_path *left_path, sctx->right_path = right_path; sctx->cmp_key = key; - ret = finish_inode_if_needed(sctx, 0); + ret = finish_inode_if_needed(sctx, false); if (ret < 0) return ret; @@ -7328,7 +7328,7 @@ static int full_send_tree(struct send_ctx *sctx) } out_finish: - return finish_inode_if_needed(sctx, 1); + return finish_inode_if_needed(sctx, true); } static int replace_node_with_clone(struct btrfs_path *path, int level) @@ -7879,7 +7879,7 @@ static int send_subvol(struct send_ctx *sctx) ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; - ret = finish_inode_if_needed(sctx, 1); + ret = finish_inode_if_needed(sctx, true); if (ret < 0) goto out; } else { diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 87cbc051cb12..f0436eea1544 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -129,6 +129,15 @@ * churn a lot and we can avoid making some extent tree modifications if we * are able to delay for as long as possible. * + * RECLAIM_ZONES + * This state only works for the zoned mode. In zoned mode, we cannot reuse + * regions that have once been allocated and then been freed until we reset + * the zone, due to the sequential write requirement. The RECLAIM_ZONES state + * calls the reclaim machinery, evacuating the still valid data in these + * block-groups and relocates it to the data_reloc_bg. Afterwards these + * block-groups get deleted and the transaction is committed. 
This frees up + * space to use for new allocations. + * * RESET_ZONES * This state works only for the zoned mode. On the zoned mode, we cannot * reuse once allocated then freed region until we reset the zone, due to @@ -203,6 +212,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) #define BTRFS_UNALLOC_BLOCK_GROUP_TARGET (10ULL) +#define BTRFS_ZONED_SYNC_RECLAIM_BATCH (5) + /* * Calculate chunk size depending on volume type (regular or zoned). */ @@ -276,10 +287,8 @@ static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flag sub_group->subgroup_id = id; ret = btrfs_sysfs_add_space_info_type(sub_group); - if (ret) { - kfree(sub_group); + if (ret) parent->sub_group[index] = NULL; - } return ret; } @@ -311,7 +320,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) ret = btrfs_sysfs_add_space_info_type(space_info); if (ret) - goto out_free; + return ret; list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) @@ -403,10 +412,10 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, up_write(&space_info->groups_sem); } -struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, +struct btrfs_space_info *btrfs_find_space_info(const struct btrfs_fs_info *info, u64 flags) { - struct list_head *head = &info->space_info; + const struct list_head *head = &info->space_info; struct btrfs_space_info *found; flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; @@ -418,7 +427,7 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } -static u64 calc_effective_data_chunk_size(struct btrfs_fs_info *fs_info) +static u64 calc_effective_data_chunk_size(const struct btrfs_fs_info *fs_info) { struct btrfs_space_info *data_sinfo; u64 data_chunk_size; @@ -444,6 +453,7 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) { struct btrfs_fs_info *fs_info = space_info->fs_info; + bool 
has_per_profile; u64 profile; u64 avail; u64 data_chunk_size; @@ -454,19 +464,21 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, else profile = btrfs_metadata_alloc_profile(fs_info); - avail = atomic64_read(&fs_info->free_chunk_space); - - /* - * If we have dup, raid1 or raid10 then only half of the free - * space is actually usable. For raid56, the space info used - * doesn't include the parity drive, so we don't have to - * change the math - */ - factor = btrfs_bg_type_to_factor(profile); - avail = div_u64(avail, factor); - if (avail == 0) - return 0; + has_per_profile = btrfs_get_per_profile_avail(fs_info, profile, &avail); + if (!has_per_profile) { + avail = atomic64_read(&fs_info->free_chunk_space); + /* + * If we have dup, raid1 or raid10 then only half of the free + * space is actually usable. For raid56, the space info used + * doesn't include the parity drive, so we don't have to + * change the math + */ + factor = btrfs_bg_type_to_factor(profile); + avail = div_u64(avail, factor); + if (avail == 0) + return 0; + } data_chunk_size = calc_effective_data_chunk_size(fs_info); /* @@ -489,10 +501,10 @@ static u64 calc_available_free_space(const struct btrfs_space_info *space_info, /* * If we aren't flushing all things, let us overcommit up to * 1/2th of the space. If we can flush, don't let us overcommit - * too much, let it overcommit up to 1/8 of the space. + * too much, let it overcommit up to 1/64th of the space. 
*/ - if (flush == BTRFS_RESERVE_FLUSH_ALL) - avail >>= 3; + if (flush == BTRFS_RESERVE_FLUSH_ALL || flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) + avail >>= 6; else avail >>= 1; @@ -902,6 +914,18 @@ static void flush_space(struct btrfs_space_info *space_info, u64 num_bytes, if (ret > 0 || ret == -ENOSPC) ret = 0; break; + case RECLAIM_ZONES: + if (btrfs_is_zoned(fs_info)) { + btrfs_reclaim_sweep(fs_info); + btrfs_delete_unused_bgs(fs_info); + btrfs_reclaim_block_groups(fs_info, + BTRFS_ZONED_SYNC_RECLAIM_BATCH); + ASSERT(current->journal_info == NULL); + ret = btrfs_commit_current_transaction(root); + } else { + ret = 0; + } + break; case RUN_DELAYED_IPUTS: /* * If we have pending delayed iputs then we could free up a @@ -1400,6 +1424,7 @@ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_FULL, RUN_DELAYED_IPUTS, COMMIT_TRANS, + RECLAIM_ZONES, RESET_ZONES, ALLOC_CHUNK_FORCE, }; diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index 0703f24b23f7..24f45072ca4b 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -21,7 +21,24 @@ struct btrfs_block_group; * The higher the level, the more methods we try to reclaim space. */ enum btrfs_reserve_flush_enum { - /* If we are in the transaction, we can't flush anything.*/ + /* + * Used when we can't flush or don't need: + * + * 1) We are holding a transaction handle open, so we can't flush as + * that could deadlock. + * + * 2) For a nowait write we don't want to block when reserving delalloc. + * + * 3) Joining a transaction or attaching a transaction, we don't want + * to wait and we don't need to reserve anything (any needed space + * was reserved before in a dedicated block reserve, or we rely on + * the global block reserve, see btrfs_init_root_block_rsv()). + * + * 4) Starting a transaction when we don't need to reserve space, as + * we don't need it because we previously reserved in a dedicated + * block reserve or rely on the global block reserve, like the above + * case. 
+ */ BTRFS_RESERVE_NO_FLUSH, /* @@ -96,6 +113,7 @@ enum btrfs_flush_state { RUN_DELAYED_IPUTS = 10, COMMIT_TRANS = 11, RESET_ZONES = 12, + RECLAIM_ZONES = 13, }; enum btrfs_space_info_sub_group { @@ -274,7 +292,7 @@ void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group); void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, u64 chunk_size); -struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, +struct btrfs_space_info *btrfs_find_space_info(const struct btrfs_fs_info *info, u64 flags); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); void btrfs_dump_space_info(struct btrfs_space_info *info, u64 bytes, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b4d26ca9220a..b26aa9169e83 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1299,7 +1299,7 @@ static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) { int ret; - if (BTRFS_FS_ERROR(fs_info)) { + if (unlikely(BTRFS_FS_ERROR(fs_info))) { btrfs_err(fs_info, "remounting read-write after error is not allowed"); return -EINVAL; @@ -2423,7 +2423,6 @@ static long btrfs_free_cached_objects(struct super_block *sb, struct shrink_cont return 0; } -#ifdef CONFIG_BTRFS_EXPERIMENTAL static int btrfs_remove_bdev(struct super_block *sb, struct block_device *bdev) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -2481,7 +2480,6 @@ static void btrfs_shutdown(struct super_block *sb) btrfs_force_shutdown(fs_info); } -#endif static int btrfs_show_stats(struct seq_file *seq, struct dentry *root) { @@ -2511,10 +2509,8 @@ static const struct super_operations btrfs_super_ops = { .nr_cached_objects = btrfs_nr_cached_objects, .free_cached_objects = btrfs_free_cached_objects, .show_stats = btrfs_show_stats, -#ifdef CONFIG_BTRFS_EXPERIMENTAL .remove_bdev = btrfs_remove_bdev, .shutdown = btrfs_shutdown, -#endif }; static const struct file_operations btrfs_ctl_fops = { diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 
d80a86acfbbe..f85f8a8a7bfe 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -18,7 +18,7 @@ char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); -static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) +static inline struct btrfs_fs_info *btrfs_sb(const struct super_block *sb) { return sb->s_fs_info; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 90e50e62dd17..19c127ac6d10 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -303,6 +303,9 @@ int btrfs_run_sanity_tests(void) } } ret = btrfs_test_extent_map(); + if (ret) + goto out; + ret = btrfs_test_zoned(); out: btrfs_destroy_test_fs(); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index b03d85a6e5ef..cea58fe84a6d 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -63,6 +63,16 @@ void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); void btrfs_init_dummy_transaction(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info); + +#ifdef CONFIG_BLK_DEV_ZONED +int btrfs_test_zoned(void); +#else +static inline int btrfs_test_zoned(void) +{ + return 0; +} +#endif + #else static inline int btrfs_run_sanity_tests(void) { diff --git a/fs/btrfs/tests/zoned-tests.c b/fs/btrfs/tests/zoned-tests.c new file mode 100644 index 000000000000..2bc3b14baa41 --- /dev/null +++ b/fs/btrfs/tests/zoned-tests.c @@ -0,0 +1,675 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2026 Western Digital. All rights reserved. 
+ */ + +#include +#include + +#include "btrfs-tests.h" +#include "../space-info.h" +#include "../volumes.h" +#include "../zoned.h" + +#define WP_MISSING_DEV ((u64)-1) +#define WP_CONVENTIONAL ((u64)-2) +#define ZONE_SIZE SZ_256M + +#define HALF_STRIPE_LEN (BTRFS_STRIPE_LEN >> 1) + +struct load_zone_info_test_vector { + u64 raid_type; + u64 num_stripes; + u64 alloc_offsets[8]; + u64 last_alloc; + u64 bg_length; + bool degraded; + + int expected_result; + u64 expected_alloc_offset; + + const char *description; +}; + +struct zone_info { + u64 physical; + u64 capacity; + u64 alloc_offset; +}; + +static int test_load_zone_info(struct btrfs_fs_info *fs_info, + const struct load_zone_info_test_vector *test) +{ + struct btrfs_block_group *bg __free(btrfs_free_dummy_block_group) = NULL; + struct btrfs_chunk_map *map __free(btrfs_free_chunk_map) = NULL; + struct zone_info AUTO_KFREE(zone_info); + unsigned long AUTO_KFREE(active); + int ret; + + bg = btrfs_alloc_dummy_block_group(fs_info, test->bg_length); + if (!bg) { + test_std_err(TEST_ALLOC_BLOCK_GROUP); + return -ENOMEM; + } + + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); + if (!map) { + test_std_err(TEST_ALLOC_EXTENT_MAP); + return -ENOMEM; + } + + zone_info = kzalloc_objs(*zone_info, test->num_stripes, GFP_KERNEL); + if (!zone_info) { + test_err("cannot allocate zone info"); + return -ENOMEM; + } + + active = bitmap_zalloc(test->num_stripes, GFP_KERNEL); + if (!active) { + test_err("cannot allocate active bitmap"); + return -ENOMEM; + } + + map->type = test->raid_type; + map->num_stripes = test->num_stripes; + if (test->raid_type == BTRFS_BLOCK_GROUP_RAID10) + map->sub_stripes = 2; + for (int i = 0; i < test->num_stripes; i++) { + zone_info[i].physical = 0; + zone_info[i].alloc_offset = test->alloc_offsets[i]; + zone_info[i].capacity = ZONE_SIZE; + if (zone_info[i].alloc_offset && zone_info[i].alloc_offset < ZONE_SIZE) + __set_bit(i, active); + } + if (test->degraded) + 
btrfs_set_opt(fs_info->mount_opt, DEGRADED); + else + btrfs_clear_opt(fs_info->mount_opt, DEGRADED); + + ret = btrfs_load_block_group_by_raid_type(bg, map, zone_info, active, + test->last_alloc); + + if (ret != test->expected_result) { + test_err("unexpected return value: ret %d expected %d", ret, + test->expected_result); + return -EINVAL; + } + + if (!ret && bg->alloc_offset != test->expected_alloc_offset) { + test_err("unexpected alloc_offset: alloc_offset %llu expected %llu", + bg->alloc_offset, test->expected_alloc_offset); + return -EINVAL; + } + + return 0; +} + +static const struct load_zone_info_test_vector load_zone_info_tests[] = { + /* SINGLE */ + { + .description = "SINGLE: load write pointer from sequential zone", + .raid_type = 0, + .num_stripes = 1, + .alloc_offsets = { + SZ_1M, + }, + .expected_alloc_offset = SZ_1M, + }, + /* + * SINGLE block group on a conventional zone sets last_alloc outside of + * btrfs_load_block_group_*(). Do not test that case. + */ + + /* DUP */ + /* Normal case */ + { + .description = "DUP: having matching write pointers", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, SZ_1M, + }, + .expected_alloc_offset = SZ_1M, + }, + /* + * One sequential zone and one conventional zone, having matching + * last_alloc. + */ + { + .description = "DUP: seq zone and conv zone, matching last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_1M, + .expected_alloc_offset = SZ_1M, + }, + /* + * One sequential and one conventional zone, but having smaller + * last_alloc than write pointer. + */ + { + .description = "DUP: seq zone and conv zone, smaller last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = 0, + .expected_alloc_offset = SZ_1M, + }, + /* Error case: having different write pointers. 
*/ + { + .description = "DUP: fail: different write pointers", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, SZ_2M, + }, + .expected_result = -EIO, + }, + /* Error case: partial missing device should not happen on DUP. */ + { + .description = "DUP: fail: missing device", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_MISSING_DEV, + }, + .expected_result = -EIO, + }, + /* + * Error case: one sequential and one conventional zone, but having larger + * last_alloc than write pointer. + */ + { + .description = "DUP: fail: seq zone and conv zone, larger last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_DUP, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M, + .expected_result = -EIO, + }, + + /* RAID1 */ + /* Normal case */ + { + .description = "RAID1: having matching write pointers", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, SZ_1M, + }, + .expected_alloc_offset = SZ_1M, + }, + /* + * One sequential zone and one conventional zone, having matching + * last_alloc. + */ + { + .description = "RAID1: seq zone and conv zone, matching last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_1M, + .expected_alloc_offset = SZ_1M, + }, + /* + * One sequential and one conventional zone, but having smaller + * last_alloc than write pointer. 
+ */ + { + .description = "RAID1: seq zone and conv zone, smaller last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = 0, + .expected_alloc_offset = SZ_1M, + }, + /* Partial missing device should be recovered on DEGRADED mount */ + { + .description = "RAID1: fail: missing device on DEGRADED", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_MISSING_DEV, + }, + .degraded = true, + .expected_alloc_offset = SZ_1M, + }, + /* Error case: having different write pointers. */ + { + .description = "RAID1: fail: different write pointers", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, SZ_2M, + }, + .expected_result = -EIO, + }, + /* + * A partially missing device on a non-DEGRADED mount never happens here, + * as it is rejected beforehand. + */ + /* + * Error case: one sequential and one conventional zone, but having larger + * last_alloc than write pointer. 
+ */ + { + .description = "RAID1: fail: seq zone and conv zone, larger last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_RAID1, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M, + .expected_result = -EIO, + }, + + /* RAID0 */ + /* Normal case */ + { + .description = "RAID0: initial partial write", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + HALF_STRIPE_LEN, 0, 0, 0, + }, + .expected_alloc_offset = HALF_STRIPE_LEN, + }, + { + .description = "RAID0: while in second stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5 + HALF_STRIPE_LEN, + }, + { + .description = "RAID0: one stripe advanced", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M + BTRFS_STRIPE_LEN, SZ_1M, + }, + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, + }, + /* Error case: having different write pointers. */ + { + .description = "RAID0: fail: disordered stripes", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN * 2, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_result = -EIO, + }, + { + .description = "RAID0: fail: far distance", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_result = -EIO, + }, + { + .description = "RAID0: fail: too many partial write", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 0, 0, + }, + .expected_result = -EIO, + }, + /* + * Error case: a partially missing device is not allowed even on a DEGRADED + * mount. 
+ */ + { + .description = "RAID0: fail: missing device on DEGRADED", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_MISSING_DEV, + }, + .degraded = true, + .expected_result = -EIO, + }, + + /* + * One sequential zone and one conventional zone, having matching + * last_alloc. + */ + { + .description = "RAID0: seq zone and conv zone, partially written stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M - SZ_4K, + .expected_alloc_offset = SZ_2M - SZ_4K, + }, + { + .description = "RAID0: conv zone and seq zone, partially written stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 2, + .alloc_offsets = { + WP_CONVENTIONAL, SZ_1M, + }, + .last_alloc = SZ_2M + SZ_4K, + .expected_alloc_offset = SZ_2M + SZ_4K, + }, + /* + * Error case: one sequential and one conventional zone, but having larger + * last_alloc than write pointer. + */ + { + .description = "RAID0: fail: seq zone and conv zone, larger last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 2, + .alloc_offsets = { + SZ_1M, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M + BTRFS_STRIPE_LEN * 2, + .expected_result = -EIO, + }, + + /* RAID0, 4 stripes with seq zones and conv zones. */ + { + .description = "RAID0: stripes [2, 2, ?, ?] last_alloc = 6", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 6, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 6, + }, + { + .description = "RAID0: stripes [2, 2, ?, ?] 
last_alloc = 7.5", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, + }, + { + .description = "RAID0: stripes [3, ?, ?, ?] last_alloc = 1", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 3, WP_CONVENTIONAL, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 9, + }, + { + .description = "RAID0: stripes [2, ?, 1, ?] last_alloc = 5", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, WP_CONVENTIONAL, + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 5, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5, + }, + { + .description = "RAID0: fail: stripes [2, ?, 1, ?] 
last_alloc = 7", + .raid_type = BTRFS_BLOCK_GROUP_RAID0, + .num_stripes = 4, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, WP_CONVENTIONAL, + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 7, + .expected_result = -EIO, + }, + + /* RAID10 */ + /* Normal case */ + { + .description = "RAID10: initial partial write", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + HALF_STRIPE_LEN, HALF_STRIPE_LEN, 0, 0, + }, + .expected_alloc_offset = HALF_STRIPE_LEN, + }, + { + .description = "RAID10: while in second stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, + BTRFS_STRIPE_LEN + HALF_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5 + HALF_STRIPE_LEN, + }, + { + .description = "RAID10: one stripe advanced", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + SZ_1M + BTRFS_STRIPE_LEN, SZ_1M + BTRFS_STRIPE_LEN, + SZ_1M, SZ_1M, + }, + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, + }, + { + .description = "RAID10: one stripe advanced, with conventional zone", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + SZ_1M + BTRFS_STRIPE_LEN, WP_CONVENTIONAL, + WP_CONVENTIONAL, SZ_1M, + }, + .expected_alloc_offset = SZ_2M + BTRFS_STRIPE_LEN, + }, + /* Error case: having different write pointers. 
*/ + { + .description = "RAID10: fail: disordered stripes", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_result = -EIO, + }, + { + .description = "RAID10: fail: far distance", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN * 3, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + }, + .expected_result = -EIO, + }, + { + .description = "RAID10: fail: too many partial write", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + HALF_STRIPE_LEN, HALF_STRIPE_LEN, + HALF_STRIPE_LEN, HALF_STRIPE_LEN, + 0, 0, 0, 0, + }, + .expected_result = -EIO, + }, + /* + * Error case: a partially missing device at the RAID0 level is not allowed + * even on a DEGRADED mount. + */ + { + .description = "RAID10: fail: missing device on DEGRADED", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + SZ_1M, SZ_1M, + WP_MISSING_DEV, WP_MISSING_DEV, + }, + .degraded = true, + .expected_result = -EIO, + }, + + /* + * One sequential zone and one conventional zone, having matching + * last_alloc. 
+ */ + { + .description = "RAID10: seq zone and conv zone, partially written stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + SZ_1M, SZ_1M, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M - SZ_4K, + .expected_alloc_offset = SZ_2M - SZ_4K, + }, + { + .description = "RAID10: conv zone and seq zone, partially written stripe", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + WP_CONVENTIONAL, WP_CONVENTIONAL, + SZ_1M, SZ_1M, + }, + .last_alloc = SZ_2M + SZ_4K, + .expected_alloc_offset = SZ_2M + SZ_4K, + }, + /* + * Error case: one sequential and one conventional zone, but having larger + * last_alloc than write pointer. + */ + { + .description = "RAID10: fail: seq zone and conv zone, larger last_alloc", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 4, + .alloc_offsets = { + SZ_1M, SZ_1M, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = SZ_2M + BTRFS_STRIPE_LEN * 2, + .expected_result = -EIO, + }, + + /* RAID10, 4 stripes with seq zones and conv zones. */ + { + .description = "RAID10: stripes [2, 2, ?, ?] last_alloc = 6", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 6, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 6, + }, + { + .description = "RAID10: stripes [2, 2, ?, ?] last_alloc = 7.5", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 7 + HALF_STRIPE_LEN, + }, + { + .description = "RAID10: stripes [3, ?, ?, ?] 
last_alloc = 1", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 3, BTRFS_STRIPE_LEN * 3, + WP_CONVENTIONAL, WP_CONVENTIONAL, + WP_CONVENTIONAL, WP_CONVENTIONAL, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 9, + }, + { + .description = "RAID10: stripes [2, ?, 1, ?] last_alloc = 5", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 5, + .expected_alloc_offset = BTRFS_STRIPE_LEN * 5, + }, + { + .description = "RAID10: fail: stripes [2, ?, 1, ?] last_alloc = 7", + .raid_type = BTRFS_BLOCK_GROUP_RAID10, + .num_stripes = 8, + .alloc_offsets = { + BTRFS_STRIPE_LEN * 2, BTRFS_STRIPE_LEN * 2, + WP_CONVENTIONAL, WP_CONVENTIONAL, + BTRFS_STRIPE_LEN, BTRFS_STRIPE_LEN, + WP_CONVENTIONAL, WP_CONVENTIONAL, + }, + .last_alloc = BTRFS_STRIPE_LEN * 7, + .expected_result = -EIO, + }, +}; + +int btrfs_test_zoned(void) +{ + struct btrfs_fs_info *fs_info __free(btrfs_free_dummy_fs_info) = NULL; + int ret; + + test_msg("running zoned tests (error messages are expected)"); + + fs_info = btrfs_alloc_dummy_fs_info(PAGE_SIZE, PAGE_SIZE); + if (!fs_info) { + test_std_err(TEST_ALLOC_FS_INFO); + return -ENOMEM; + } + + for (int i = 0; i < ARRAY_SIZE(load_zone_info_tests); i++) { + ret = test_load_zone_info(fs_info, &load_zone_info_tests[i]); + if (ret) { + test_err("test case \"%s\" failed", load_zone_info_tests[i].description); + return ret; + } + } + + return 0; +} diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 8dd77c431974..248adb785051 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -15,6 +15,7 @@ #include "misc.h" #include "ctree.h" #include "disk-io.h" +#include "extent_io.h" #include 
"transaction.h" #include "locking.h" #include "tree-log.h" @@ -274,7 +275,7 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, spin_lock(&fs_info->trans_lock); loop: /* The file system has been taken offline. No new transactions. */ - if (BTRFS_FS_ERROR(fs_info)) { + if (unlikely(BTRFS_FS_ERROR(fs_info))) { spin_unlock(&fs_info->trans_lock); return -EROFS; } @@ -332,7 +333,7 @@ loop: btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); kfree(cur_trans); goto loop; - } else if (BTRFS_FS_ERROR(fs_info)) { + } else if (unlikely(BTRFS_FS_ERROR(fs_info))) { spin_unlock(&fs_info->trans_lock); btrfs_lockdep_release(fs_info, btrfs_trans_num_extwriters); btrfs_lockdep_release(fs_info, btrfs_trans_num_writers); @@ -503,7 +504,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&fs_info->reloc_mutex); - ret = record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, false); mutex_unlock(&fs_info->reloc_mutex); return ret; @@ -611,7 +612,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, bool do_chunk_alloc = false; int ret; - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) return ERR_PTR(-EROFS); if (current->journal_info) { @@ -678,6 +679,14 @@ start_transaction(struct btrfs_root *root, unsigned int num_items, * here. 
*/ ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + ret = btrfs_commit_current_transaction(root); + if (ret) + goto reserve_fail; + ret = btrfs_delayed_refs_rsv_refill(fs_info, flush); + } + if (ret) goto reserve_fail; } @@ -688,6 +697,8 @@ again: goto alloc_fail; } + xa_init(&h->writeback_inhibited_ebs); + /* * If we are JOIN_NOLOCK we're already committing a transaction and * waiting on this guy, so we don't need to do the sb_start_intwrite @@ -1084,6 +1095,13 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (trans->type & __TRANS_FREEZABLE) sb_end_intwrite(info->sb); + /* + * Uninhibit extent buffer writeback before decrementing num_writers, + * since the decrement wakes the committing thread which needs all + * buffers uninhibited to write them to disk. + */ + btrfs_uninhibit_all_eb_writeback(trans); + WARN_ON(cur_trans != info->running_transaction); WARN_ON(atomic_read(&cur_trans->num_writers) < 1); atomic_dec(&cur_trans->num_writers); @@ -1102,7 +1120,7 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (throttle) btrfs_run_delayed_iputs(info); - if (TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info)) { + if (unlikely(TRANS_ABORTED(trans) || BTRFS_FS_ERROR(info))) { wake_up_process(info->transaction_kthread); if (TRANS_ABORTED(trans)) ret = trans->aborted; @@ -1571,7 +1589,7 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * recorded root will never be updated again, causing an outdated root * item. 
*/ - ret = record_root_in_trans(trans, src, 1); + ret = record_root_in_trans(trans, src, true); if (ret) return ret; @@ -1594,16 +1612,16 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, ret = commit_fs_roots(trans); if (ret) - goto out; + return ret; ret = btrfs_qgroup_account_extents(trans); if (ret < 0) - goto out; + return ret; /* Now qgroup are all updated, we can inherit it to new qgroups */ ret = btrfs_qgroup_inherit(trans, btrfs_root_id(src), dst_objectid, btrfs_root_id(parent), inherit); if (ret < 0) - goto out; + return ret; /* * Now we do a simplified commit transaction, which will: @@ -1619,23 +1637,22 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, */ ret = commit_cowonly_roots(trans); if (ret) - goto out; + return ret; switch_commit_roots(trans); ret = btrfs_write_and_wait_transaction(trans); - if (unlikely(ret)) + if (unlikely(ret)) { btrfs_err(fs_info, "error while writing out transaction during qgroup snapshot accounting: %d", ret); + return ret; + } -out: /* * Force parent root to be updated, as we recorded it before so its * last_trans == cur_transid. 
* Or it won't be committed again onto disk after later * insert_dir_item() */ - if (!ret) - ret = record_root_in_trans(trans, parent, 1); - return ret; + return record_root_in_trans(trans, parent, true); } /* @@ -1662,7 +1679,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; - struct extent_buffer *old; + struct extent_buffer *root_eb; struct timespec64 cur_time; int ret = 0; u64 to_reserve = 0; @@ -1719,7 +1736,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trans->transid, trans->bytes_reserved, 1); parent_root = parent_inode->root; - ret = record_root_in_trans(trans, parent_root, 0); + ret = record_root_in_trans(trans, parent_root, false); if (unlikely(ret)) goto fail; cur_time = current_time(&parent_inode->vfs_inode); @@ -1737,7 +1754,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, btrfs_ino(parent_inode), &fname.disk_name, 0); - if (unlikely(dir_item != NULL && !IS_ERR(dir_item))) { + if (!IS_ERR_OR_NULL(dir_item)) { pending->error = -EEXIST; goto dir_item_existed; } else if (IS_ERR(dir_item)) { @@ -1767,7 +1784,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - ret = record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, false); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1800,20 +1817,10 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec); btrfs_set_root_otransid(new_root_item, trans->transid); - old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, - BTRFS_NESTING_COW); - if (unlikely(ret)) { - btrfs_tree_unlock(old); - free_extent_buffer(old); - btrfs_abort_transaction(trans, 
ret); - goto fail; - } - - ret = btrfs_copy_root(trans, root, old, &tmp, objectid); - /* clean up in any case */ - btrfs_tree_unlock(old); - free_extent_buffer(old); + root_eb = btrfs_lock_root_node(root); + ret = btrfs_copy_root(trans, root, root_eb, &tmp, objectid); + btrfs_tree_unlock(root_eb); + free_extent_buffer(root_eb); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1921,7 +1928,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ if (ret == -EOVERFLOW) ret = 0; - if (unlikely(ret && ret != -EEXIST)) { + if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); goto fail; } @@ -2127,6 +2134,7 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) btrfs_scrub_cancel(fs_info); + btrfs_uninhibit_all_eb_writeback(trans); kmem_cache_free(btrfs_trans_handle_cachep, trans); } @@ -2343,7 +2351,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) * abort to prevent writing a new superblock that reflects a * corrupt state (pointing to trees with unwritten nodes/leafs). */ - if (BTRFS_FS_ERROR(fs_info)) { + if (unlikely(BTRFS_FS_ERROR(fs_info))) { spin_unlock(&fs_info->trans_lock); ret = -EROFS; goto lockdep_release; @@ -2566,6 +2574,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) fs_info->cleaner_kthread) wake_up_process(fs_info->cleaner_kthread); + /* + * Uninhibit writeback on all extent buffers inhibited during this + * transaction before writing them to disk. Inhibiting prevented + * writeback while the transaction was building, but now we need + * them written. 
+ */ + btrfs_uninhibit_all_eb_writeback(trans); + ret = btrfs_write_and_wait_transaction(trans); if (unlikely(ret)) { btrfs_err(fs_info, "error while writing out transaction: %d", ret); @@ -2573,7 +2589,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) goto scrub_continue; } - ret = write_all_supers(fs_info, 0); + ret = write_all_supers(trans); /* * the super is written, we can safely allow the tree-loggers * to go about their business @@ -2641,8 +2657,6 @@ cleanup_transaction: btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; btrfs_warn(fs_info, "Skipping commit of aborted transaction."); - if (current->journal_info == trans) - current->journal_info = NULL; cleanup_transaction(trans, ret); return ret; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 18ef069197e5..7d70fe486758 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,6 +12,7 @@ #include #include #include +#include #include "btrfs_inode.h" #include "delayed-ref.h" @@ -162,6 +163,8 @@ struct btrfs_trans_handle { struct btrfs_fs_info *fs_info; struct list_head new_bgs; struct btrfs_block_rsv delayed_rsv; + /* Extent buffers with writeback inhibited by this handle. 
*/ + struct xarray writeback_inhibited_ebs; }; /* diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index b4e114efff45..1f15d0793a9c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -777,6 +777,47 @@ static int check_block_group_item(struct extent_buffer *leaf, BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA); return -EUCLEAN; } + + if (unlikely(!btrfs_fs_incompat(fs_info, REMAP_TREE) && + type == BTRFS_BLOCK_GROUP_METADATA_REMAP)) { + block_group_err(leaf, slot, + "invalid type, METADATA_REMAP set but REMAP_TREE incompat flag not set"); + return -EUCLEAN; + } + + if (unlikely(!btrfs_fs_incompat(fs_info, REMAP_TREE) && + flags & BTRFS_BLOCK_GROUP_REMAPPED)) { + block_group_err(leaf, slot, + "invalid flags, REMAPPED set but REMAP_TREE incompat flag not set"); + return -EUCLEAN; + } + + if (item_size == sizeof(struct btrfs_block_group_item_v2)) { + struct btrfs_block_group_item_v2 *bgi2; + u64 remap_bytes; + u32 identity_remap_count; + + bgi2 = btrfs_item_ptr(leaf, slot, struct btrfs_block_group_item_v2); + remap_bytes = btrfs_block_group_v2_remap_bytes(leaf, bgi2); + + if (unlikely(remap_bytes > key->offset)) { + block_group_err(leaf, slot, + "invalid remap_bytes, have %llu expect [0, %llu]", + remap_bytes, key->offset); + return -EUCLEAN; + } + + identity_remap_count = btrfs_block_group_v2_identity_remap_count(leaf, bgi2); + if (unlikely((u64)identity_remap_count > + key->offset >> fs_info->sectorsize_bits)) { + block_group_err(leaf, slot, + "invalid identity_remap_count, have %u expect [0, %llu]", + identity_remap_count, + key->offset >> fs_info->sectorsize_bits); + return -EUCLEAN; + } + } + return 0; } @@ -999,6 +1040,20 @@ int btrfs_check_chunk_valid(const struct btrfs_fs_info *fs_info, } } + if (unlikely((type & BTRFS_BLOCK_GROUP_METADATA_REMAP) && + !(features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE))) { + chunk_err(fs_info, leaf, chunk, logical, + "METADATA_REMAP chunk type without REMAP_TREE incompat bit"); + return 
-EUCLEAN; + } + + if (unlikely(remapped && + !(features & BTRFS_FEATURE_INCOMPAT_REMAP_TREE))) { + chunk_err(fs_info, leaf, chunk, logical, + "REMAPPED chunk flag without REMAP_TREE incompat bit"); + return -EUCLEAN; + } + if (!remapped && !valid_stripe_count(type & BTRFS_BLOCK_GROUP_PROFILE_MASK, num_stripes, sub_stripes)) { @@ -1879,6 +1934,71 @@ static int check_raid_stripe_extent(const struct extent_buffer *leaf, return 0; } +static int check_remap_key(const struct extent_buffer *leaf, + const struct btrfs_key *key, int slot) +{ + const u32 item_size = btrfs_item_size(leaf, slot); + const u32 sectorsize = leaf->fs_info->sectorsize; + u64 end; + + if (unlikely(!btrfs_fs_incompat(leaf->fs_info, REMAP_TREE))) { + generic_err(leaf, slot, + "remap key type %u present but REMAP_TREE incompat bit unset", + key->type); + return -EUCLEAN; + } + + switch (key->type) { + case BTRFS_IDENTITY_REMAP_KEY: + if (unlikely(item_size != 0)) { + generic_err(leaf, slot, + "invalid item size for IDENTITY_REMAP, have %u expect 0", + item_size); + return -EUCLEAN; + } + break; + case BTRFS_REMAP_KEY: + case BTRFS_REMAP_BACKREF_KEY: + if (unlikely(item_size != sizeof(struct btrfs_remap_item))) { + generic_err(leaf, slot, + "invalid item size for remap key type %u, have %u expect %zu", + key->type, item_size, + sizeof(struct btrfs_remap_item)); + return -EUCLEAN; + } + break; + } + + if (unlikely(key->offset == 0)) { + generic_err(leaf, slot, + "invalid remap key length, have 0 expect nonzero"); + return -EUCLEAN; + } + + if (unlikely(!IS_ALIGNED(key->objectid, sectorsize))) { + generic_err(leaf, slot, + "invalid remap key objectid, have %llu expect aligned to %u", + key->objectid, sectorsize); + return -EUCLEAN; + } + + if (unlikely(!IS_ALIGNED(key->offset, sectorsize))) { + generic_err(leaf, slot, + "invalid remap key offset (length), have %llu expect aligned to %u", + key->offset, sectorsize); + return -EUCLEAN; + } + + if (unlikely(check_add_overflow(key->objectid, key->offset, 
&end))) { + generic_err(leaf, slot, + "remap key overflow, objectid %llu + offset %llu wraps", + key->objectid, key->offset); + return -EUCLEAN; + } + + return 0; +} + static int check_dev_extent_item(const struct extent_buffer *leaf, const struct btrfs_key *key, int slot, @@ -1945,6 +2065,119 @@ static int check_dev_extent_item(const struct extent_buffer *leaf, return 0; } +static int check_free_space_info(struct extent_buffer *leaf, struct btrfs_key *key, + int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + struct btrfs_free_space_info *fsi; + const u32 blocksize = fs_info->sectorsize; + u32 flags; + + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { + generic_err(leaf, slot, + "free space info key objectid is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { + generic_err(leaf, slot, + "free space info key offset is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + if (unlikely(btrfs_item_size(leaf, slot) != + sizeof(struct btrfs_free_space_info))) { + generic_err(leaf, slot, + "invalid item size for free space info, has %u expect %zu", + btrfs_item_size(leaf, slot), + sizeof(struct btrfs_free_space_info)); + return -EUCLEAN; + } + fsi = btrfs_item_ptr(leaf, slot, struct btrfs_free_space_info); + flags = btrfs_free_space_flags(leaf, fsi); + if (unlikely(flags & ~BTRFS_FREE_SPACE_FLAGS_MASK)) { + generic_err(leaf, slot, + "unknown flags for free space info, has 0x%x valid mask 0x%lx", + flags, BTRFS_FREE_SPACE_FLAGS_MASK); + return -EUCLEAN; + } + if (unlikely(btrfs_free_space_extent_count(leaf, fsi) > + key->offset >> fs_info->sectorsize_bits)) { + generic_err(leaf, slot, + "suspicious extent count, has %u max valid %llu", + btrfs_free_space_extent_count(leaf, fsi), + key->offset >> fs_info->sectorsize_bits); + return -EUCLEAN; + } + return 0; +} + +static int 
check_free_space_extent(struct extent_buffer *leaf, struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + const u32 blocksize = fs_info->sectorsize; + + /* Both key fields must be aligned to the filesystem sector size. */ + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { + generic_err(leaf, slot, + "free space extent key objectid is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { + generic_err(leaf, slot, + "free space extent key offset is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + /* A free space extent item is expected to carry no payload (item size 0). */ + if (unlikely(btrfs_item_size(leaf, slot) != 0)) { + generic_err(leaf, slot, + "invalid item size for free space extent, has %u expect 0", + btrfs_item_size(leaf, slot)); + return -EUCLEAN; + } + return 0; +} + +static int check_free_space_bitmap(struct extent_buffer *leaf, + struct btrfs_key *key, int slot) +{ + struct btrfs_fs_info *fs_info = leaf->fs_info; + const u32 blocksize = fs_info->sectorsize; + u32 expected_item_size; + + if (unlikely(!IS_ALIGNED(key->objectid, blocksize))) { + generic_err(leaf, slot, + "free space bitmap key objectid is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + if (unlikely(!IS_ALIGNED(key->offset, blocksize))) { + generic_err(leaf, slot, + "free space bitmap key offset is not aligned to %u, has " BTRFS_KEY_FMT, + blocksize, BTRFS_KEY_FMT_VALUE(key)); + return -EUCLEAN; + } + if (unlikely(key->offset == 0)) { + generic_err(leaf, slot, "free space bitmap length is 0"); + return -EUCLEAN; + } + /* + * The item must hold exactly the right number of bitmap bytes for the + * range described by key->offset. A mismatch means the item was + * truncated or the key is corrupt; either way the bitmap data is not + * safe to access. 
+ */ + expected_item_size = DIV_ROUND_UP(key->offset >> fs_info->sectorsize_bits, + BITS_PER_BYTE); + if (unlikely(btrfs_item_size(leaf, slot) != expected_item_size)) { + generic_err(leaf, slot, + "invalid item size for free space bitmap, has %u expect %u", + btrfs_item_size(leaf, slot), expected_item_size); + return -EUCLEAN; + } + return 0; +} + /* * Common point to switch the item-specific validation. */ @@ -2008,6 +2241,20 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, case BTRFS_RAID_STRIPE_KEY: ret = check_raid_stripe_extent(leaf, key, slot); break; + case BTRFS_FREE_SPACE_INFO_KEY: + ret = check_free_space_info(leaf, key, slot); + break; + case BTRFS_FREE_SPACE_EXTENT_KEY: + ret = check_free_space_extent(leaf, key, slot); + break; + case BTRFS_FREE_SPACE_BITMAP_KEY: + ret = check_free_space_bitmap(leaf, key, slot); + break; + case BTRFS_IDENTITY_REMAP_KEY: + case BTRFS_REMAP_KEY: + case BTRFS_REMAP_BACKREF_KEY: + ret = check_remap_key(leaf, key, slot); + break; } if (unlikely(ret)) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index ac871efb9763..9123adafa0d1 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -457,7 +457,7 @@ static int process_one_buffer(struct extent_buffer *eb, return ret; } - if (btrfs_buffer_uptodate(eb, gen, false) && level == 0) { + if (btrfs_buffer_uptodate(eb, gen, NULL) && level == 0) { ret = btrfs_exclude_logged_extents(eb); if (ret) btrfs_abort_transaction(trans, ret); @@ -1003,7 +1003,7 @@ static noinline int replay_one_extent(struct walk_control *wc) btrfs_root_id(root)); } if (!ret) { - ret = btrfs_csum_file_blocks(trans, csum_root, sums); + ret = btrfs_insert_data_csums(trans, csum_root, sums); if (ret) btrfs_abort_log_replay(wc, ret, "failed to add csums for range [%llu, %llu) inode %llu root %llu", @@ -1711,7 +1711,7 @@ static noinline int add_inode_ref(struct walk_control *wc) } /* insert our name */ - ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); + 
ret = btrfs_add_link(trans, dir, inode, &name, false, ref_index); if (ret) { btrfs_abort_log_replay(wc, ret, "failed to add link for inode %llu in dir %llu ref_index %llu name %.*s root %llu", @@ -2059,7 +2059,7 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, return PTR_ERR(dir); } - ret = btrfs_add_link(trans, dir, inode, name, 1, index); + ret = btrfs_add_link(trans, dir, inode, name, true, index); /* FIXME, put inode into FIXUP list */ @@ -3566,7 +3566,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * writing the super here would result in transid mismatches. If there * is an error here just bail. */ - if (BTRFS_FS_ERROR(fs_info)) { + if (unlikely(BTRFS_FS_ERROR(fs_info))) { ret = -EIO; btrfs_set_log_full_commit(trans); btrfs_abort_transaction(trans, ret); @@ -3576,7 +3576,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, btrfs_set_super_log_root(fs_info->super_for_commit, log_root_start); btrfs_set_super_log_root_level(fs_info->super_for_commit, log_root_level); - ret = write_all_supers(fs_info, 1); + ret = write_all_supers(trans); mutex_unlock(&fs_info->tree_log_mutex); if (unlikely(ret)) { btrfs_set_log_full_commit(trans); @@ -3681,25 +3681,22 @@ static void free_log_tree(struct btrfs_trans_handle *trans, * free all the extents used by the tree log. 
This should be called * at commit time of the full transaction */ -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +void btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) { if (root->log_root) { free_log_tree(trans, root->log_root); root->log_root = NULL; clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &root->state); } - return 0; } -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) +void btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info) { if (fs_info->log_root_tree) { free_log_tree(trans, fs_info->log_root_tree); fs_info->log_root_tree = NULL; clear_bit(BTRFS_ROOT_HAS_LOG_TREE, &fs_info->tree_root->state); } - return 0; } static bool mark_inode_as_not_logged(const struct btrfs_trans_handle *trans, @@ -4613,10 +4610,11 @@ static int truncate_inode_items(struct btrfs_trans_handle *trans, static void fill_inode_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf, struct btrfs_inode_item *item, - struct inode *inode, bool log_inode_only, + struct btrfs_inode *inode, bool log_inode_only, u64 logged_isize) { - u64 gen = BTRFS_I(inode)->generation; + struct inode *vfs_inode = &inode->vfs_inode; + u64 gen = inode->generation; u64 flags; if (log_inode_only) { @@ -4631,33 +4629,33 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * and one can set it to 0 since that only happens on eviction * and we are holding a ref on the inode. 
*/ - ASSERT(data_race(BTRFS_I(inode)->logged_trans) > 0); - if (data_race(BTRFS_I(inode)->logged_trans) < trans->transid) + ASSERT(data_race(inode->logged_trans) > 0); + if (data_race(inode->logged_trans) < trans->transid) gen = 0; btrfs_set_inode_size(leaf, item, logged_isize); } else { - btrfs_set_inode_size(leaf, item, inode->i_size); + btrfs_set_inode_size(leaf, item, vfs_inode->i_size); } btrfs_set_inode_generation(leaf, item, gen); - btrfs_set_inode_uid(leaf, item, i_uid_read(inode)); - btrfs_set_inode_gid(leaf, item, i_gid_read(inode)); - btrfs_set_inode_mode(leaf, item, inode->i_mode); - btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + btrfs_set_inode_uid(leaf, item, i_uid_read(vfs_inode)); + btrfs_set_inode_gid(leaf, item, i_gid_read(vfs_inode)); + btrfs_set_inode_mode(leaf, item, vfs_inode->i_mode); + btrfs_set_inode_nlink(leaf, item, vfs_inode->i_nlink); - btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(inode)); - btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->atime, inode_get_atime_sec(vfs_inode)); + btrfs_set_timespec_nsec(leaf, &item->atime, inode_get_atime_nsec(vfs_inode)); - btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(inode)); - btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->mtime, inode_get_mtime_sec(vfs_inode)); + btrfs_set_timespec_nsec(leaf, &item->mtime, inode_get_mtime_nsec(vfs_inode)); - btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(inode)); - btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(inode)); + btrfs_set_timespec_sec(leaf, &item->ctime, inode_get_ctime_sec(vfs_inode)); + btrfs_set_timespec_nsec(leaf, &item->ctime, inode_get_ctime_nsec(vfs_inode)); - btrfs_set_timespec_sec(leaf, &item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_timespec_nsec(leaf, &item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_timespec_sec(leaf, 
&item->otime, inode->i_otime_sec); + btrfs_set_timespec_nsec(leaf, &item->otime, inode->i_otime_nsec); /* * We do not need to set the nbytes field, in fact during a fast fsync @@ -4668,11 +4666,10 @@ static void fill_inode_item(struct btrfs_trans_handle *trans, * inode item in subvolume tree as needed (see overwrite_item()). */ - btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(inode)); + btrfs_set_inode_sequence(leaf, item, inode_peek_iversion(vfs_inode)); btrfs_set_inode_transid(leaf, item, trans->transid); - btrfs_set_inode_rdev(leaf, item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_inode_rdev(leaf, item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_inode_flags(leaf, item, flags); btrfs_set_inode_block_group(leaf, item, 0); } @@ -4719,8 +4716,7 @@ static int log_inode_item(struct btrfs_trans_handle *trans, return ret; inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); - fill_inode_item(trans, path->nodes[0], inode_item, &inode->vfs_inode, - false, 0); + fill_inode_item(trans, path->nodes[0], inode_item, inode, false, 0); btrfs_release_path(path); return 0; } @@ -4740,7 +4736,7 @@ static int log_csums(struct btrfs_trans_handle *trans, * worry about logging checksum items with overlapping ranges. */ if (inode->last_reflink_trans < trans->transid) - return btrfs_csum_file_blocks(trans, log_root, sums); + return btrfs_insert_data_csums(trans, log_root, sums); /* * Serialize logging for checksums. 
This is to avoid racing with the @@ -4763,7 +4759,7 @@ static int log_csums(struct btrfs_trans_handle *trans, */ ret = btrfs_del_csums(trans, log_root, sums->logical, sums->len); if (!ret) - ret = btrfs_csum_file_blocks(trans, log_root, sums); + ret = btrfs_insert_data_csums(trans, log_root, sums); btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, &cached_state); @@ -4989,8 +4985,7 @@ copy_item: inode_item = btrfs_item_ptr(dst_path->nodes[0], dst_slot, struct btrfs_inode_item); fill_inode_item(trans, dst_path->nodes[0], inode_item, - &inode->vfs_inode, - inode_only == LOG_INODE_EXISTS, + inode, inode_only == LOG_INODE_EXISTS, logged_isize); } else { copy_extent_buffer(dst_path->nodes[0], src, dst_offset, @@ -5088,7 +5083,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) continue; - list_for_each_entry(sums, &ordered->list, list) { + list_for_each_entry(sums, &ordered->csum_list, list) { ret = log_csums(trans, inode, log_root, sums); if (ret) return ret; @@ -5803,7 +5798,7 @@ static int btrfs_check_ref_name_override(struct extent_buffer *eb, name_str.len = this_name_len; di = btrfs_lookup_dir_item(NULL, inode->root, search_path, parent, &name_str, 0); - if (di && !IS_ERR(di)) { + if (!IS_ERR_OR_NULL(di)) { struct btrfs_key di_key; btrfs_dir_item_key_to_cpu(search_path->nodes[0], diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 41e47fda036d..4a626dc6a58b 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -71,9 +71,8 @@ static inline int btrfs_need_log_full_commit(struct btrfs_trans_handle *trans) int btrfs_sync_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_log_ctx *ctx); -int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); -int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); +void btrfs_free_log(struct btrfs_trans_handle *trans, struct 
btrfs_root *root); +void btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c index 603c1457130e..a8094928f4c9 100644 --- a/fs/btrfs/tree-mod-log.c +++ b/fs/btrfs/tree-mod-log.c @@ -1042,12 +1042,10 @@ struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) check.owner_root = btrfs_root_id(root); old = read_tree_block(fs_info, logical, &check); - if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { - if (!IS_ERR(old)) - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old))) { btrfs_warn(fs_info, - "failed to read tree block %llu from get_old_root", - logical); + "failed to read tree block %llu from get_old_root: %ld", + logical, PTR_ERR(old)); } else { struct tree_mod_elem *tm2; diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index 43c17a1d3451..467dff7212d6 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -35,7 +35,7 @@ static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, const u8 *uuid, struct btrfs_key key; if (WARN_ON_ONCE(!uuid_root)) - return -ENOENT; + return -EINVAL; path = btrfs_alloc_path(); if (!path) @@ -92,9 +92,6 @@ int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans, const u8 *uuid, u8 typ if (ret != -ENOENT) return ret; - if (WARN_ON_ONCE(!uuid_root)) - return -EINVAL; - btrfs_uuid_to_key(uuid, type, &key); path = btrfs_alloc_path(); @@ -516,7 +513,7 @@ skip: out: btrfs_free_path(path); - if (trans && !IS_ERR(trans)) + if (!IS_ERR_OR_NULL(trans)) btrfs_end_transaction(trans); if (ret) btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0af16946dcda..a88e68f90564 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -392,6 +392,7 @@ static struct btrfs_fs_devices 
*alloc_fs_devices(const u8 *fsid) INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); INIT_LIST_HEAD(&fs_devs->seed_list); + spin_lock_init(&fs_devs->per_profile_lock); if (fsid) { memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -2339,6 +2340,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, mutex_lock(&fs_info->chunk_mutex); list_del_init(&device->dev_alloc_list); device->fs_devices->rw_devices--; + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); } @@ -2450,6 +2452,7 @@ error_undo: list_add(&device->dev_alloc_list, &fs_devices->alloc_list); device->fs_devices->rw_devices++; + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); } return ret; @@ -2937,6 +2940,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path */ btrfs_clear_space_info_full(fs_info); + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ @@ -2947,6 +2951,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_lock(&fs_info->chunk_mutex); ret = init_first_rw_device(trans); + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); if (unlikely(ret)) { btrfs_abort_transaction(trans, ret); @@ -3029,6 +3034,7 @@ error_sysfs: orig_super_total_bytes); btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices); + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); mutex_unlock(&fs_info->fs_devices->device_list_mutex); error_trans: @@ -3121,6 +3127,7 @@ int btrfs_grow_device(struct btrfs_trans_handle *trans, if (list_empty(&device->post_commit_list)) list_add_tail(&device->post_commit_list, &trans->transaction->dev_update_list); + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); @@ -3497,6 +3504,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle 
*trans, u64 chunk_offset) } } + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); trans->removing_chunk = false; @@ -3594,7 +3602,7 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset, bool v * If we had a transaction abort, stop all running scrubs. * See transaction.c:cleanup_transaction() why we do it here. */ - if (BTRFS_FS_ERROR(fs_info)) + if (unlikely(BTRFS_FS_ERROR(fs_info))) btrfs_scrub_cancel(fs_info); return ret; } @@ -5200,6 +5208,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) atomic64_sub(free_diff, &fs_info->free_chunk_space); } + btrfs_update_per_profile_avail(fs_info); /* * Once the device's size has been set to the new size, ensure all * in-memory chunks are synced to disk so that the loop below sees them @@ -5315,6 +5324,7 @@ again: WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, round_down(old_total - diff, fs_info->sectorsize)); + btrfs_update_per_profile_avail(fs_info); mutex_unlock(&fs_info->chunk_mutex); btrfs_reserve_chunk_metadata(trans, false); @@ -5387,6 +5397,168 @@ static int btrfs_cmp_device_info(const void *a, const void *b) return 0; } +/* + * Return 0 if we allocated any virtual(*) chunk, and store its size in + * @allocated. + * Return -ENOSPC if we have no more space to allocate a virtual chunk. + * + * *: A virtual chunk is a chunk that only exists during per-profile available + * space estimation. + * Virtual chunks do not take any on-disk space, they only emulate + * chunk allocator behavior to get an accurate estimation of available space. + * + * Another difference is, a virtual chunk has no size limit and doesn't care + * about holes in the device tree, allowing us to exhaust device space + * much faster. 
+ */ +static int alloc_virtual_chunk(struct btrfs_fs_info *fs_info, + struct btrfs_device_info *devices_info, + enum btrfs_raid_types type, + u64 *allocated) +{ + const struct btrfs_raid_attr *raid_attr = &btrfs_raid_array[type]; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 stripe_size; + int ndevs = 0; + + lockdep_assert_held(&fs_info->chunk_mutex); + + /* Go through devices to collect their unallocated space. */ + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) { + u64 avail; + + if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, + &device->dev_state) || + test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) + continue; + + if (device->total_bytes > device->bytes_used + + device->per_profile_allocated) + avail = device->total_bytes - device->bytes_used - + device->per_profile_allocated; + else + avail = 0; + + avail = round_down(avail, fs_info->sectorsize); + + /* And exclude the [0, 1M) reserved space. */ + if (avail > BTRFS_DEVICE_RANGE_RESERVED) + avail -= BTRFS_DEVICE_RANGE_RESERVED; + else + avail = 0; + + /* + * Not enough to support a single stripe, this device + * can not be utilized for chunk allocation. + */ + if (avail < BTRFS_STRIPE_LEN) + continue; + + /* + * Unlike chunk allocator, we don't care about stripe or hole + * size, so here we use @avail directly. + */ + devices_info[ndevs].dev_offset = 0; + devices_info[ndevs].total_avail = avail; + devices_info[ndevs].max_avail = avail; + devices_info[ndevs].dev = device; + ++ndevs; + } + sort(devices_info, ndevs, sizeof(struct btrfs_device_info), + btrfs_cmp_device_info, NULL); + ndevs = rounddown(ndevs, raid_attr->devs_increment); + if (ndevs < raid_attr->devs_min) + return -ENOSPC; + if (raid_attr->devs_max) + ndevs = min(ndevs, (int)raid_attr->devs_max); + else + ndevs = min(ndevs, (int)BTRFS_MAX_DEVS(fs_info)); + + /* + * Stripe size will be determined by the device with the least + * unallocated space. 
+ */ + stripe_size = devices_info[ndevs - 1].total_avail; + + for (int i = 0; i < ndevs; i++) + devices_info[i].dev->per_profile_allocated += stripe_size; + *allocated = div_u64(stripe_size * (ndevs - raid_attr->nparity), + raid_attr->ncopies); + return 0; +} + +static int calc_one_profile_avail(struct btrfs_fs_info *fs_info, + enum btrfs_raid_types type, + u64 *result_ret) +{ + struct btrfs_device_info *devices_info = NULL; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 allocated; + u64 result = 0; + int ret = 0; + + lockdep_assert_held(&fs_info->chunk_mutex); + ASSERT(type >= 0 && type < BTRFS_NR_RAID_TYPES); + + /* Not enough devices, quick exit, just update the result. */ + if (fs_devices->rw_devices < btrfs_raid_array[type].devs_min) { + ret = -ENOSPC; + goto out; + } + + devices_info = kzalloc_objs(*devices_info, fs_devices->rw_devices, GFP_NOFS); + if (!devices_info) { + ret = -ENOMEM; + goto out; + } + /* Clear virtual chunk used space for each device. */ + list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) + device->per_profile_allocated = 0; + + while (!alloc_virtual_chunk(fs_info, devices_info, type, &allocated)) + result += allocated; + +out: + kfree(devices_info); + if (ret < 0 && ret != -ENOSPC) + return ret; + *result_ret = result; + return 0; +} + +/* Update the per-profile available space array. */ +void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info) +{ + u64 results[BTRFS_NR_RAID_TYPES]; + int ret; + + /* + * Zoned is more complex as we can not simply get the amount of + * available space for each device. 
+ */ + if (btrfs_is_zoned(fs_info)) + goto error; + + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + ret = calc_one_profile_avail(fs_info, i, &results[i]); + if (ret < 0) + goto error; + } + + spin_lock(&fs_info->fs_devices->per_profile_lock); + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) + fs_info->fs_devices->per_profile_avail[i] = results[i]; + spin_unlock(&fs_info->fs_devices->per_profile_lock); + return; +error: + spin_lock(&fs_info->fs_devices->per_profile_lock); + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) + fs_info->fs_devices->per_profile_avail[i] = U64_MAX; + spin_unlock(&fs_info->fs_devices->per_profile_lock); +} + static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) { if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK)) @@ -5864,6 +6036,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); + btrfs_update_per_profile_avail(info); + return block_group; } @@ -5901,8 +6075,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, ctl.space_info = space_info; init_alloc_chunk_ctl(fs_devices, &ctl); - devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), - GFP_NOFS); + devices_info = kzalloc_objs(*devices_info, fs_devices->rw_devices, GFP_NOFS); if (!devices_info) return ERR_PTR(-ENOMEM); @@ -8077,6 +8250,36 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) struct btrfs_device *device; int stats_cnt; int ret = 0; + bool need_update_dev_stats = false; + + /* + * Do an initial pass using RCU to see if we need to update any dev + * stats item. This is to avoid taking the device_list_mutex which is + * acquired by the fitrim operation and can take a while since it does + * discard operations while holding that mutex. 
Most of the time, if + * we are on a healthy filesystem, we don't have new stat updates, so + * this avoids blocking on that mutex, which is especially important + * because we are called during the critical section of a transaction + * commit, therefore blocking new transactions from starting while + * discard is running. + * + * Also note that adding/removing devices also requires starting a + * transaction, and since we are called from the critical section of a + * transaction commit, no one can be concurrently adding or removing a + * device. + */ + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { + if (device->dev_stats_valid && + atomic_read(&device->dev_stats_ccnt) != 0) { + need_update_dev_stats = true; + break; + } + } + rcu_read_unlock(); + + if (!need_update_dev_stats) + return 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { @@ -8439,7 +8642,14 @@ int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info) } /* Ensure all chunks have corresponding dev extents */ - return verify_chunk_dev_extent_mapping(fs_info); + ret = verify_chunk_dev_extent_mapping(fs_info); + if (ret < 0) + return ret; + + mutex_lock(&fs_info->chunk_mutex); + btrfs_update_per_profile_avail(fs_info); + mutex_unlock(&fs_info->chunk_mutex); + return 0; } /* @@ -8457,7 +8667,12 @@ bool btrfs_verify_dev_items(const struct btrfs_fs_info *fs_info) mutex_lock(&uuid_mutex); list_for_each_entry(dev, &fs_info->fs_devices->devices, dev_list) { - if (!test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) { + /* + * The replace target dev item (devid 0) is not inserted into the chunk + * tree, so skip the BTRFS_DEV_STATE_ITEM_FOUND check. 
+ */ + if (dev->devid != BTRFS_DEV_REPLACE_DEVID && + !test_bit(BTRFS_DEV_STATE_ITEM_FOUND, &dev->dev_state)) { btrfs_err(fs_info, "devid %llu path %s is registered but not found in chunk tree", dev->devid, btrfs_dev_name(dev)); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 8288d79372a5..0082c166af91 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -22,6 +22,7 @@ #include #include "messages.h" #include "extent-io-tree.h" +#include "fs.h" struct block_device; struct bdev_handle; @@ -213,6 +214,12 @@ struct btrfs_device { /* Bandwidth limit for scrub, in bytes */ u64 scrub_speed_max; + + /* + * Temporary amount of space allocated on this device during + * per-profile available space calculation. + */ + u64 per_profile_allocated; }; /* @@ -458,6 +465,15 @@ struct btrfs_fs_devices { /* Device to be used for reading in case of RAID1. */ u64 read_devid; #endif + + /* + * Each value indicates the available space for that profile. + * U64_MAX means the estimation is unavailable. + * + * Protected by per_profile_lock. + */ + u64 per_profile_avail[BTRFS_NR_RAID_TYPES]; + spinlock_t per_profile_lock; }; #define BTRFS_MAX_DEVS(info) ((BTRFS_MAX_ITEM_SIZE(info) \ @@ -887,6 +903,24 @@ int btrfs_bg_type_to_factor(u64 flags); const char *btrfs_bg_type_to_raid_name(u64 flags); int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info); bool btrfs_verify_dev_items(const struct btrfs_fs_info *fs_info); +void btrfs_update_per_profile_avail(struct btrfs_fs_info *fs_info); + +static inline bool btrfs_get_per_profile_avail(struct btrfs_fs_info *fs_info, + u64 profile, u64 *avail_ret) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(profile); + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + bool uptodate = false; + + spin_lock(&fs_devices->per_profile_lock); + if (fs_devices->per_profile_avail[index] != U64_MAX) { + uptodate = true; + *avail_ret = fs_devices->per_profile_avail[index]; + } + 
spin_unlock(&fs_info->fs_devices->per_profile_lock); + 
return uptodate; +} + bool btrfs_repair_one_zone(struct btrfs_fs_info *fs_info, u64 logical); bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c676e715b4f8..486b52db583e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -71,7 +71,6 @@ static bool need_special_buffer(struct btrfs_fs_info *fs_info) struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned int level) { - const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; int workspacesize; @@ -91,8 +90,8 @@ struct list_head *zlib_alloc_workspace(struct btrfs_fs_info *fs_info, unsigned i workspace->buf_size = ZLIB_DFLTCC_BUF_SIZE; } if (!workspace->buf) { - workspace->buf = kmalloc(blocksize, GFP_KERNEL); - workspace->buf_size = blocksize; + workspace->buf = kmalloc(fs_info->sectorsize, GFP_KERNEL); + workspace->buf_size = fs_info->sectorsize; } if (!workspace->strm.workspace || !workspace->buf) goto fail; @@ -157,10 +156,8 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) const u32 min_folio_size = btrfs_min_folio_size(fs_info); int ret; char *data_in = NULL; - char *cfolio_out; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - const u32 blocksize = fs_info->sectorsize; const u64 orig_end = start + len; ret = zlib_deflateInit(&workspace->strm, workspace->level); @@ -175,16 +172,15 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cfolio_out = folio_address(out_folio); workspace->strm.next_in = workspace->buf; workspace->strm.avail_in = 0; - workspace->strm.next_out = cfolio_out; + workspace->strm.next_out = folio_address(out_folio); workspace->strm.avail_out = min_folio_size; while (workspace->strm.total_in < len) { @@ 
-242,7 +238,7 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) } /* We're making it bigger, give up. */ - if (workspace->strm.total_in > blocksize * 2 && + if (workspace->strm.total_in > fs_info->sectorsize * 2 && workspace->strm.total_in < workspace->strm.total_out) { ret = -E2BIG; goto out; @@ -258,14 +254,13 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) goto out; } - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cfolio_out = folio_address(out_folio); workspace->strm.avail_out = min_folio_size; - workspace->strm.next_out = cfolio_out; + workspace->strm.next_out = folio_address(out_folio); } /* We're all done. */ if (workspace->strm.total_in >= len) @@ -296,14 +291,13 @@ int zlib_compress_bio(struct list_head *ws, struct compressed_bio *cb) goto out; } /* Get another folio for the stream end. */ - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; } - cfolio_out = folio_address(out_folio); workspace->strm.avail_out = min_folio_size; - workspace->strm.next_out = cfolio_out; + workspace->strm.next_out = folio_address(out_folio); } } /* Queue the remaining part of the folio. 
*/ @@ -351,7 +345,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) int wbits = MAX_WBITS; char *data_in; size_t total_out = 0; - size_t srclen = cb->compressed_len; + const size_t srclen = bio_get_size(&cb->bbio.bio); unsigned long buf_start; bio_first_folio(&fi, &cb->bbio.bio, 0); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 0cd7fd3fcfa3..16dd87aa06f2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1699,8 +1699,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, return -EINVAL; } - raid0_allocs = kcalloc(map->num_stripes / map->sub_stripes, sizeof(*raid0_allocs), - GFP_NOFS); + raid0_allocs = kzalloc_objs(*raid0_allocs, map->num_stripes / map->sub_stripes, GFP_NOFS); if (!raid0_allocs) return -ENOMEM; @@ -1918,7 +1917,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) cache->physical_map = map; - zone_info = kcalloc(map->num_stripes, sizeof(*zone_info), GFP_NOFS); + zone_info = kzalloc_objs(*zone_info, map->num_stripes, GFP_NOFS); if (!zone_info) { ret = -ENOMEM; goto out; @@ -2123,9 +2122,8 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) return; - ASSERT(!list_empty(&ordered->list)); - /* The ordered->list can be empty in the above pre-alloc case. 
*/ - sum = list_first_entry(&ordered->list, struct btrfs_ordered_sum, list); + ASSERT(!list_empty(&ordered->csum_list)); + sum = list_first_entry(&ordered->csum_list, struct btrfs_ordered_sum, list); logical = sum->logical; len = sum->len; @@ -2136,7 +2134,7 @@ void btrfs_finish_ordered_zoned(struct btrfs_ordered_extent *ordered) continue; } if (!btrfs_zoned_split_ordered(ordered, logical, len)) { - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + btrfs_mark_ordered_extent_error(ordered); btrfs_err(fs_info, "failed to split ordered extent"); goto out; } @@ -2156,7 +2154,7 @@ out: */ if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) { - while ((sum = list_first_entry_or_null(&ordered->list, + while ((sum = list_first_entry_or_null(&ordered->csum_list, typeof(*sum), list))) { list_del(&sum->list); kfree(sum); @@ -2386,6 +2384,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) if (!btrfs_is_zoned(block_group->fs_info)) return true; + if (unlikely(btrfs_is_testing(fs_info))) + return true; + map = block_group->physical_map; spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 3e847b91dae3..86919293fd54 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -370,7 +370,6 @@ void zstd_free_workspace(struct list_head *ws) struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) { - const u32 blocksize = fs_info->sectorsize; struct workspace *workspace; workspace = kzalloc_obj(*workspace); @@ -383,7 +382,7 @@ struct list_head *zstd_alloc_workspace(struct btrfs_fs_info *fs_info, int level) workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); - workspace->buf = kmalloc(blocksize, GFP_KERNEL); + workspace->buf = kmalloc(fs_info->sectorsize, GFP_KERNEL); if (!workspace->mem || !workspace->buf) goto fail; @@ -414,7 +413,6 @@ int zstd_compress_bio(struct 
list_head *ws, struct compressed_bio *cb) const u64 start = cb->start; const u32 len = cb->len; const u64 end = start + len; - const u32 blocksize = fs_info->sectorsize; const u32 min_folio_size = btrfs_min_folio_size(fs_info); workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); @@ -439,7 +437,7 @@ int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->in_buf.size = btrfs_calc_input_length(in_folio, end, start); /* Allocate and map in the output buffer. */ - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -463,7 +461,7 @@ int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) } /* Check to see if we are making it bigger. */ - if (tot_in + workspace->in_buf.pos > blocksize * 2 && + if (tot_in + workspace->in_buf.pos > fs_info->sectorsize * 2 && tot_in + workspace->in_buf.pos < tot_out + workspace->out_buf.pos) { ret = -E2BIG; goto out; @@ -482,7 +480,7 @@ int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) goto out; } - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -555,7 +553,7 @@ int zstd_compress_bio(struct list_head *ws, struct compressed_bio *cb) ret = -E2BIG; goto out; } - out_folio = btrfs_alloc_compr_folio(fs_info); + out_folio = btrfs_alloc_compr_folio(fs_info, GFP_NOFS); if (out_folio == NULL) { ret = -ENOMEM; goto out; @@ -587,10 +585,9 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) struct btrfs_fs_info *fs_info = cb_to_fs_info(cb); struct workspace *workspace = list_entry(ws, struct workspace, list); struct folio_iter fi; - size_t srclen = cb->compressed_len; + size_t srclen = bio_get_size(&cb->bbio.bio); zstd_dstream *stream; int ret = 0; - const u32 blocksize = fs_info->sectorsize; const unsigned int min_folio_size = 
btrfs_min_folio_size(fs_info); unsigned long folio_in_index = 0; unsigned long total_folios_in = DIV_ROUND_UP(srclen, min_folio_size); @@ -620,7 +617,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = blocksize; + workspace->out_buf.size = fs_info->sectorsize; while (1) { size_t ret2; @@ -682,7 +679,6 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, { struct workspace *workspace = list_entry(ws, struct workspace, list); struct btrfs_fs_info *fs_info = btrfs_sb(folio_inode(dest_folio)->i_sb); - const u32 sectorsize = fs_info->sectorsize; zstd_dstream *stream; int ret = 0; unsigned long to_copy = 0; @@ -706,7 +702,7 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, workspace->out_buf.dst = workspace->buf; workspace->out_buf.pos = 0; - workspace->out_buf.size = sectorsize; + workspace->out_buf.size = fs_info->sectorsize; /* * Since both input and output buffers should not exceed one sector, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 0864700f76e0..8ad7a2d76c1d 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1113,6 +1113,30 @@ TRACE_EVENT(btrfs_cow_block, __entry->cow_level) ); +TRACE_EVENT(btrfs_search_slot_restart, + + TP_PROTO(const struct btrfs_root *root, int level, + const char *reason), + + TP_ARGS(root, level, reason), + + TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) + __field( int, level ) + __string( reason, reason ) + ), + + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = btrfs_root_id(root); + __entry->level = level; + __assign_str(reason); + ), + + TP_printk_btrfs("root=%llu(%s) level=%d reason=%s", + show_root_type(__entry->root_objectid), + __entry->level, __get_str(reason)) +); + TRACE_EVENT(btrfs_space_reservation, TP_PROTO(const struct btrfs_fs_info *fs_info, const char *type, u64 val, diff --git 
a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index f7843e6bb978..cc3b9f7dccaf 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -1245,7 +1245,8 @@ struct btrfs_free_space_info { __le32 flags; } __attribute__ ((__packed__)); -#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) +#define BTRFS_FREE_SPACE_USING_BITMAPS (1UL << 0) +#define BTRFS_FREE_SPACE_FLAGS_MASK (BTRFS_FREE_SPACE_USING_BITMAPS) #define BTRFS_QGROUP_LEVEL_SHIFT 48 static inline __u16 btrfs_qgroup_level(__u64 qgroupid)