for-6.17-rc2-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmijTRYACgkQxWXV+ddt
 WDtXRg//ZY9dwFlXpjvddCUZC+LnNkF2GQOK9Rudd6KGZWphNSg2EBHPY6Om26rd
 KVpGtj7MXf/ULXYjb0833+2j/UVRj70Y2ubIC58LcEL6lavtUqweu0x+aHAjayV8
 xU7cbie+JQZhGFe9aAbp0d1PePcgW+ZF6b8lI0b6KRdR/i+/0IhrNTWG+CoClHeO
 MWn+UIuup2DhWKsLGa4oXi5CD6fOyEbS05sitMNDvAt12fj+pYnol93VXvbRZzzW
 mG0KIZcIjXZ5gW6lQk9nSo4C2y1a4eRq5xSvTyDcPAhuK2+Ytso70p9G9LQXi6pE
 2vdRkhv0x/7q95yYhQFvbVZ+0945w/jO7mBauDciXzIorNQAT+Ivr+b2AbMTfHZn
 ZEKgC8EiUBx49+QI11fT7rW3mNPWEF7FXN8tT/jq1Tf/x9tPkoJpnyb1CnZlGUxk
 UtPypbqnHzilFEgLBuZ4iMmqlGJyj47aarTwGyYjl4QV8SZILozTyfIpAu57FTqh
 DODsuDli9fgoL3AiKObhPg4qJ7WgPZ4XqYm8cJiAVpc7NeO/YW86P+FglLUC/XsK
 QtVP8GXyrQh0tMFRv1ucU+RU3y9dvaJkUQDP12vcuvaHlzFoAVVomSlZJ/KY0JSW
 zpdw3if5wJbkUsfo7/aQMzLQBwi4pRpaARvgibYCSiZDX0iqBBQ=
 =y2Q6
 -----END PGP SIGNATURE-----

Merge tag 'for-6.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs fixes from David Sterba:
 "Several zoned mode fixes, mount option printing fixups, folio state
  handling fixes and one log replay fix.

   - zoned mode:
       - zone activation and finish fixes
       - block group reservation fixes

   - mount option fixes:
       - bring back printing of mount options with key=value that got
         accidentally dropped during mount option parsing in 6.8
       - fix inverted logic and typos when printing nodatasum/nodatacow

   - folio status fixes:
       - writeback fixes in zoned mode
       - properly reset dirty/writeback if submission fails
       - properly handle TOWRITE xarray mark/tag

   - do not set mtime/ctime to current time when unlinking for log
     replay"

* tag 'for-6.17-rc2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: fix printing of mount info messages for NODATACOW/NODATASUM
  btrfs: restore mount option info messages during mount
  btrfs: fix incorrect log message for nobarrier mount option
  btrfs: fix buffer index in wait_eb_writebacks()
  btrfs: subpage: keep TOWRITE tag until folio is cleaned
  btrfs: clear TAG_TOWRITE from buffer tree when submitting a tree block
  btrfs: do not set mtime/ctime to current time when unlinking for log replay
  btrfs: clear block dirty if btrfs_writepage_cow_fixup() failed
  btrfs: clear block dirty if submit_one_sector() failed
  btrfs: zoned: limit active zones to max_open_zones
  btrfs: zoned: fix write time activation failure for metadata block group
  btrfs: zoned: fix data relocation block group reservation
  btrfs: zoned: skip ZONE FINISH of conventional zones
pull/1348/head
Linus Torvalds 2025-08-18 09:17:42 -07:00
commit be48bcf004
5 changed files with 163 additions and 55 deletions

View File

@ -1512,7 +1512,7 @@ out:
/*
* Return 0 if we have submitted or queued the sector for submission.
* Return <0 for critical errors.
* Return <0 for critical errors, and the sector will have its dirty flag cleared.
*
* Caller should make sure filepos < i_size and handle filepos >= i_size case.
*/
@ -1535,8 +1535,17 @@ static int submit_one_sector(struct btrfs_inode *inode,
ASSERT(filepos < i_size);
em = btrfs_get_extent(inode, NULL, filepos, sectorsize);
if (IS_ERR(em))
if (IS_ERR(em)) {
/*
* When submission failed, we should still clear the folio dirty.
* Or the folio will be written back again but without any
* ordered extent.
*/
btrfs_folio_clear_dirty(fs_info, folio, filepos, sectorsize);
btrfs_folio_set_writeback(fs_info, folio, filepos, sectorsize);
btrfs_folio_clear_writeback(fs_info, folio, filepos, sectorsize);
return PTR_ERR(em);
}
extent_offset = filepos - em->start;
em_end = btrfs_extent_map_end(em);
@ -1609,8 +1618,12 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
folio_unlock(folio);
return 1;
}
if (ret < 0)
if (ret < 0) {
btrfs_folio_clear_dirty(fs_info, folio, start, len);
btrfs_folio_set_writeback(fs_info, folio, start, len);
btrfs_folio_clear_writeback(fs_info, folio, start, len);
return ret;
}
for (cur = start; cur < start + len; cur += fs_info->sectorsize)
set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap);
@ -1666,8 +1679,8 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode,
* Here we set writeback and clear for the range. If the full folio
* is no longer dirty then we clear the PAGECACHE_TAG_DIRTY tag.
*
* If we hit any error, the corresponding sector will still be dirty
* thus no need to clear PAGECACHE_TAG_DIRTY.
* If we hit any error, the corresponding sector will have its dirty
* flag cleared and writeback finished, thus no need to handle the error case.
*/
if (!submitted_io && !error) {
btrfs_folio_set_writeback(fs_info, folio, start, len);
@ -1813,6 +1826,7 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e
xas_load(&xas);
xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK);
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);

View File

@ -4189,6 +4189,23 @@ int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
return ret;
}
static void update_time_after_link_or_unlink(struct btrfs_inode *dir)
{
struct timespec64 now;
/*
* If we are replaying a log tree, we do not want to update the mtime
* and ctime of the parent directory with the current time, since the
* log replay procedure is responsible for setting them to their correct
* values (the ones it had when the fsync was done).
*/
if (test_bit(BTRFS_FS_LOG_RECOVERING, &dir->root->fs_info->flags))
return;
now = inode_set_ctime_current(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, now);
}
/*
* unlink helper that gets used here in inode.c and in the tree logging
* recovery code. It removes a link in a directory with a given name, and
@ -4289,7 +4306,7 @@ skip_backref:
inode_inc_iversion(&inode->vfs_inode);
inode_set_ctime_current(&inode->vfs_inode);
inode_inc_iversion(&dir->vfs_inode);
inode_set_mtime_to_ts(&dir->vfs_inode, inode_set_ctime_current(&dir->vfs_inode));
update_time_after_link_or_unlink(dir);
return btrfs_update_inode(trans, dir);
}
@ -6683,15 +6700,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans,
btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size +
name->len * 2);
inode_inc_iversion(&parent_inode->vfs_inode);
/*
* If we are replaying a log tree, we do not want to update the mtime
* and ctime of the parent directory with the current time, since the
* log replay procedure is responsible for setting them to their correct
* values (the ones it had when the fsync was done).
*/
if (!test_bit(BTRFS_FS_LOG_RECOVERING, &root->fs_info->flags))
inode_set_mtime_to_ts(&parent_inode->vfs_inode,
inode_set_ctime_current(&parent_inode->vfs_inode));
update_time_after_link_or_unlink(parent_inode);
ret = btrfs_update_inode(trans, parent_inode);
if (ret)

View File

@ -448,8 +448,25 @@ void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
spin_lock_irqsave(&bfs->lock, flags);
bitmap_set(bfs->bitmaps, start_bit, len >> fs_info->sectorsize_bits);
/*
* Don't clear the TOWRITE tag when starting writeback on a still-dirty
* folio. Doing so can cause WB_SYNC_ALL writepages() to overlook it,
* assume writeback is complete, and exit too early violating sync
* ordering guarantees.
*/
if (!folio_test_writeback(folio))
folio_start_writeback(folio);
__folio_start_writeback(folio, true);
if (!folio_test_dirty(folio)) {
struct address_space *mapping = folio_mapping(folio);
XA_STATE(xas, &mapping->i_pages, folio->index);
unsigned long flags;
xas_lock_irqsave(&xas, flags);
xas_load(&xas);
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
xas_unlock_irqrestore(&xas, flags);
}
spin_unlock_irqrestore(&bfs->lock, flags);
}

View File

@ -88,6 +88,9 @@ struct btrfs_fs_context {
refcount_t refs;
};
static void btrfs_emit_options(struct btrfs_fs_info *info,
struct btrfs_fs_context *old);
enum {
Opt_acl,
Opt_clear_cache,
@ -698,12 +701,9 @@ bool btrfs_check_options(const struct btrfs_fs_info *info,
if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) {
if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) {
btrfs_info(info, "disk space caching is enabled");
btrfs_warn(info,
"space cache v1 is being deprecated and will be removed in a future release, please use -o space_cache=v2");
}
if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE))
btrfs_info(info, "using free-space-tree");
}
return ret;
@ -980,6 +980,8 @@ static int btrfs_fill_super(struct super_block *sb,
return ret;
}
btrfs_emit_options(fs_info, NULL);
inode = btrfs_iget(BTRFS_FIRST_FREE_OBJECTID, fs_info->fs_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
@ -1437,7 +1439,7 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
{
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts");
btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum");
btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow");
btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations");
btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme");
btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers");
@ -1459,10 +1461,11 @@ static void btrfs_emit_options(struct btrfs_fs_info *info,
btrfs_info_if_set(info, old, IGNOREMETACSUMS, "ignoring meta csums");
btrfs_info_if_set(info, old, IGNORESUPERFLAGS, "ignoring unknown super block flags");
btrfs_info_if_unset(info, old, NODATASUM, "setting datasum");
btrfs_info_if_unset(info, old, NODATACOW, "setting datacow");
btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations");
btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme");
btrfs_info_if_unset(info, old, NOBARRIER, "turning off barriers");
btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers");
btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log");
btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching");
btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree");

View File

@ -17,6 +17,7 @@
#include "accessors.h"
#include "bio.h"
#include "transaction.h"
#include "sysfs.h"
/* Maximum number of zones to report per blkdev_report_zones() call */
#define BTRFS_REPORT_NR_ZONES 4096
@ -42,6 +43,9 @@
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
/* Default number of max active zones when the device has no limits. */
#define BTRFS_DEFAULT_MAX_ACTIVE_ZONES 128
/*
* Minimum of active zones we need:
*
@ -416,7 +420,10 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
max_active_zones = bdev_max_active_zones(bdev);
max_active_zones = min_not_zero(bdev_max_active_zones(bdev),
bdev_max_open_zones(bdev));
if (!max_active_zones && zone_info->nr_zones > BTRFS_DEFAULT_MAX_ACTIVE_ZONES)
max_active_zones = BTRFS_DEFAULT_MAX_ACTIVE_ZONES;
if (max_active_zones && max_active_zones < BTRFS_MIN_ACTIVE_ZONES) {
btrfs_err(fs_info,
"zoned: %s: max active zones %u is too small, need at least %u active zones",
@ -2168,10 +2175,15 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
goto out_unlock;
}
/* No space left */
if (btrfs_zoned_bg_is_full(block_group)) {
ret = false;
goto out_unlock;
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) {
/* The caller should check if the block group is full. */
if (WARN_ON_ONCE(btrfs_zoned_bg_is_full(block_group))) {
ret = false;
goto out_unlock;
}
} else {
/* Since it is already written, it should have been active. */
WARN_ON_ONCE(block_group->meta_write_pointer != block_group->start);
}
for (i = 0; i < map->num_stripes; i++) {
@ -2230,7 +2242,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
struct btrfs_fs_info *fs_info = block_group->fs_info;
const u64 end = block_group->start + block_group->length;
struct extent_buffer *eb;
unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits);
unsigned long index, start = (block_group->start >> fs_info->nodesize_bits);
rcu_read_lock();
xa_for_each_start(&fs_info->buffer_tree, index, eb, start) {
@ -2245,6 +2257,40 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group)
rcu_read_unlock();
}
static int call_zone_finish(struct btrfs_block_group *block_group,
struct btrfs_io_stripe *stripe)
{
struct btrfs_device *device = stripe->dev;
const u64 physical = stripe->physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
int ret;
if (!device->bdev)
return 0;
if (zinfo->max_active_zones == 0)
return 0;
if (btrfs_dev_is_sequential(device, physical)) {
unsigned int nofs_flags;
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
zinfo->zone_size >> SECTOR_SHIFT);
memalloc_nofs_restore(nofs_flags);
if (ret)
return ret;
}
if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
return 0;
}
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
@ -2329,31 +2375,12 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ
down_read(&dev_replace->rwsem);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
struct btrfs_zoned_device_info *zinfo = device->zone_info;
unsigned int nofs_flags;
if (!device->bdev)
continue;
if (zinfo->max_active_zones == 0)
continue;
nofs_flags = memalloc_nofs_save();
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
zinfo->zone_size >> SECTOR_SHIFT);
memalloc_nofs_restore(nofs_flags);
ret = call_zone_finish(block_group, &map->stripes[i]);
if (ret) {
up_read(&dev_replace->rwsem);
return ret;
}
if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
zinfo->reserved_active_zones++;
btrfs_dev_clear_active_zone(device, physical);
}
up_read(&dev_replace->rwsem);
@ -2504,12 +2531,12 @@ void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
{
struct btrfs_space_info *data_sinfo = fs_info->data_sinfo;
struct btrfs_space_info *space_info = data_sinfo->sub_group[0];
struct btrfs_space_info *space_info = data_sinfo;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *bg;
struct list_head *bg_list;
u64 alloc_flags;
bool initial = false;
bool first = true;
bool did_chunk_alloc = false;
int index;
int ret;
@ -2523,21 +2550,52 @@ void btrfs_zoned_reserve_data_reloc_bg(struct btrfs_fs_info *fs_info)
if (sb_rdonly(fs_info->sb))
return;
ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags);
index = btrfs_bg_flags_to_raid_index(alloc_flags);
bg_list = &data_sinfo->block_groups[index];
/* Scan the data space_info to find empty block groups. Take the second one. */
again:
bg_list = &space_info->block_groups[index];
list_for_each_entry(bg, bg_list, list) {
if (bg->used > 0)
if (bg->alloc_offset != 0)
continue;
if (!initial) {
initial = true;
if (first) {
first = false;
continue;
}
if (space_info == data_sinfo) {
/* Migrate the block group to the data relocation space_info. */
struct btrfs_space_info *reloc_sinfo = data_sinfo->sub_group[0];
int factor;
ASSERT(reloc_sinfo->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
factor = btrfs_bg_type_to_factor(bg->flags);
down_write(&space_info->groups_sem);
list_del_init(&bg->list);
/* We can assume this as we choose the second empty one. */
ASSERT(!list_empty(&space_info->block_groups[index]));
up_write(&space_info->groups_sem);
spin_lock(&space_info->lock);
space_info->total_bytes -= bg->length;
space_info->disk_total -= bg->length * factor;
/* There is no allocation ever happened. */
ASSERT(bg->used == 0);
ASSERT(bg->zone_unusable == 0);
/* No super block in a block group on the zoned setup. */
ASSERT(bg->bytes_super == 0);
spin_unlock(&space_info->lock);
bg->space_info = reloc_sinfo;
if (reloc_sinfo->block_group_kobjs[index] == NULL)
btrfs_sysfs_add_block_group_type(bg);
btrfs_add_bg_to_space_info(fs_info, bg);
}
fs_info->data_reloc_bg = bg->start;
set_bit(BLOCK_GROUP_FLAG_ZONED_DATA_RELOC, &bg->runtime_flags);
btrfs_zone_activate(bg);
@ -2552,11 +2610,18 @@ again:
if (IS_ERR(trans))
return;
/* Allocate new BG in the data relocation space_info. */
space_info = data_sinfo->sub_group[0];
ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC);
ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE);
btrfs_end_transaction(trans);
if (ret == 1) {
/*
* We allocated a new block group in the data relocation space_info. We
* can take that one.
*/
first = false;
did_chunk_alloc = true;
bg_list = &space_info->block_groups[index];
goto again;
}
}