From fcd9531b305288dc2848d38567d466af4ee147b2 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 15 Feb 2023 12:59:49 -0800 Subject: [PATCH 1/8] btrfs: sysfs: add size class stats Make it possible to see the distribution of size classes for block groups. Helpful for testing and debugging the allocator w.r.t. to size classes. The new stats can be found at the path: /sys/fs/btrfs//allocation//size_class but they will only be non-zero for bg-type = data. Signed-off-by: Boris Burkov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8c5efa5813b3..37fc58a7f27e 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include "messages.h" #include "ctree.h" @@ -778,6 +779,45 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj, return len; } +static ssize_t btrfs_size_classes_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + struct btrfs_block_group *bg; + u32 none = 0; + u32 small = 0; + u32 medium = 0; + u32 large = 0; + + for (int i = 0; i < BTRFS_NR_RAID_TYPES; ++i) { + down_read(&sinfo->groups_sem); + list_for_each_entry(bg, &sinfo->block_groups[i], list) { + if (!btrfs_block_group_should_use_size_class(bg)) + continue; + switch (bg->size_class) { + case BTRFS_BG_SZ_NONE: + none++; + break; + case BTRFS_BG_SZ_SMALL: + small++; + break; + case BTRFS_BG_SZ_MEDIUM: + medium++; + break; + case BTRFS_BG_SZ_LARGE: + large++; + break; + } + } + up_read(&sinfo->groups_sem); + } + return sysfs_emit(buf, "none %u\n" + "small %u\n" + "medium %u\n" + "large %u\n", + none, small, medium, large); +} + #ifdef CONFIG_BTRFS_DEBUG /* * Request chunk allocation with current chunk size. @@ -835,6 +875,7 @@ SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); +BTRFS_ATTR(space_info, size_classes, btrfs_size_classes_show); static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, struct kobj_attribute *a, @@ -887,6 +928,7 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, disk_total), BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), BTRFS_ATTR_PTR(space_info, chunk_size), + BTRFS_ATTR_PTR(space_info, size_classes), #ifdef CONFIG_BTRFS_DEBUG BTRFS_ATTR_PTR(space_info, force_chunk_alloc), #endif From 12148367d7235a015c7a3ab20bc189a34f63688e Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Wed, 15 Feb 2023 12:59:50 -0800 Subject: [PATCH 2/8] btrfs: fix potential dead lock in size class loading logic As reported by Filipe, there's a potential deadlock caused by using btrfs_search_forward on commit_root. The locking there is unconditional, even if ->skip_locking and ->search_commit_root is set. It's not meant to be used for commit roots, so it always needs to do locking. So if another task is COWing a child node of the same root node and then needs to wait for block group caching to complete when trying to allocate a metadata extent, it deadlocks. For example: [539604.239315] sysrq: Show Blocked State [539604.240133] task:kworker/u16:6 state:D stack:0 pid:2119594 ppid:2 flags:0x00004000 [539604.241613] Workqueue: btrfs-cache btrfs_work_helper [btrfs] [539604.242673] Call Trace: [539604.243129] [539604.243925] __schedule+0x41d/0xee0 [539604.244797] ? rcu_read_lock_sched_held+0x12/0x70 [539604.245399] ? rwsem_down_read_slowpath+0x185/0x490 [539604.246111] schedule+0x5d/0xf0 [539604.246593] rwsem_down_read_slowpath+0x2da/0x490 [539604.247290] ? rcu_barrier_tasks_trace+0x10/0x20 [539604.248090] __down_read_common+0x3d/0x150 [539604.248702] down_read_nested+0xc3/0x140 [539604.249280] __btrfs_tree_read_lock+0x24/0x100 [btrfs] [539604.250097] btrfs_read_lock_root_node+0x48/0x60 [btrfs] [539604.250915] btrfs_search_forward+0x59/0x460 [btrfs] [539604.251781] ? btrfs_global_root+0x50/0x70 [btrfs] [539604.252476] caching_thread+0x1be/0x920 [btrfs] [539604.253167] btrfs_work_helper+0xf6/0x400 [btrfs] [539604.253848] process_one_work+0x24f/0x5a0 [539604.254476] worker_thread+0x52/0x3b0 [539604.255166] ? __pfx_worker_thread+0x10/0x10 [539604.256047] kthread+0xf0/0x120 [539604.256591] ? __pfx_kthread+0x10/0x10 [539604.257212] ret_from_fork+0x29/0x50 [539604.257822] [539604.258233] task:btrfs-transacti state:D stack:0 pid:2236474 ppid:2 flags:0x00004000 [539604.259802] Call Trace: [539604.260243] [539604.260615] __schedule+0x41d/0xee0 [539604.261205] ? rcu_read_lock_sched_held+0x12/0x70 [539604.262000] ? rwsem_down_read_slowpath+0x185/0x490 [539604.262822] schedule+0x5d/0xf0 [539604.263374] rwsem_down_read_slowpath+0x2da/0x490 [539604.266228] ? lock_acquire+0x160/0x310 [539604.266917] ? rcu_read_lock_sched_held+0x12/0x70 [539604.267996] ? lock_contended+0x19e/0x500 [539604.268720] __down_read_common+0x3d/0x150 [539604.269400] down_read_nested+0xc3/0x140 [539604.270057] __btrfs_tree_read_lock+0x24/0x100 [btrfs] [539604.271129] btrfs_read_lock_root_node+0x48/0x60 [btrfs] [539604.272372] btrfs_search_slot+0x143/0xf70 [btrfs] [539604.273295] update_block_group_item+0x9e/0x190 [btrfs] [539604.274282] btrfs_start_dirty_block_groups+0x1c4/0x4f0 [btrfs] [539604.275381] ? __mutex_unlock_slowpath+0x45/0x280 [539604.276390] btrfs_commit_transaction+0xee/0xed0 [btrfs] [539604.277391] ? lock_acquire+0x1a4/0x310 [539604.278080] ? start_transaction+0xcb/0x6c0 [btrfs] [539604.279099] transaction_kthread+0x142/0x1c0 [btrfs] [539604.279996] ? __pfx_transaction_kthread+0x10/0x10 [btrfs] [539604.280673] kthread+0xf0/0x120 [539604.281050] ? __pfx_kthread+0x10/0x10 [539604.281496] ret_from_fork+0x29/0x50 [539604.281966] [539604.282255] task:fsstress state:D stack:0 pid:2236483 ppid:1 flags:0x00004006 [539604.283897] Call Trace: [539604.284700] [539604.285088] __schedule+0x41d/0xee0 [539604.285660] schedule+0x5d/0xf0 [539604.286175] btrfs_wait_block_group_cache_progress+0xf2/0x170 [btrfs] [539604.287342] ? __pfx_autoremove_wake_function+0x10/0x10 [539604.288450] find_free_extent+0xd93/0x1750 [btrfs] [539604.289256] ? _raw_spin_unlock+0x29/0x50 [539604.289911] ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs] [539604.290843] btrfs_reserve_extent+0x147/0x290 [btrfs] [539604.291943] btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs] [539604.292903] __btrfs_cow_block+0x138/0x580 [btrfs] [539604.293773] btrfs_cow_block+0x10e/0x240 [btrfs] [539604.294595] btrfs_search_slot+0x7f3/0xf70 [btrfs] [539604.295585] btrfs_update_device+0x71/0x1b0 [btrfs] [539604.296459] btrfs_chunk_alloc_add_chunk_item+0xe0/0x340 [btrfs] [539604.297489] btrfs_chunk_alloc+0x1bf/0x490 [btrfs] [539604.298335] find_free_extent+0x6fa/0x1750 [btrfs] [539604.299174] ? _raw_spin_unlock+0x29/0x50 [539604.299950] ? btrfs_get_alloc_profile+0x127/0x2a0 [btrfs] [539604.300918] btrfs_reserve_extent+0x147/0x290 [btrfs] [539604.301797] btrfs_alloc_tree_block+0xcb/0x3e0 [btrfs] [539604.303017] ? lock_release+0x224/0x4a0 [539604.303855] __btrfs_cow_block+0x138/0x580 [btrfs] [539604.304789] btrfs_cow_block+0x10e/0x240 [btrfs] [539604.305611] btrfs_search_slot+0x7f3/0xf70 [btrfs] [539604.306682] ? btrfs_global_root+0x50/0x70 [btrfs] [539604.308198] lookup_inline_extent_backref+0x17b/0x7a0 [btrfs] [539604.309254] lookup_extent_backref+0x43/0xd0 [btrfs] [539604.310122] __btrfs_free_extent+0xf8/0x810 [btrfs] [539604.310874] ? lock_release+0x224/0x4a0 [539604.311724] ? btrfs_merge_delayed_refs+0x17b/0x1d0 [btrfs] [539604.313023] __btrfs_run_delayed_refs+0x2ba/0x1260 [btrfs] [539604.314271] btrfs_run_delayed_refs+0x8f/0x1c0 [btrfs] [539604.315445] ? rcu_read_lock_sched_held+0x12/0x70 [539604.316706] btrfs_commit_transaction+0xa2/0xed0 [btrfs] [539604.317855] ? do_raw_spin_unlock+0x4b/0xa0 [539604.318544] ? _raw_spin_unlock+0x29/0x50 [539604.319240] create_subvol+0x53d/0x6e0 [btrfs] [539604.320283] btrfs_mksubvol+0x4f5/0x590 [btrfs] [539604.321220] __btrfs_ioctl_snap_create+0x11b/0x180 [btrfs] [539604.322307] btrfs_ioctl_snap_create_v2+0xc6/0x150 [btrfs] [539604.323295] btrfs_ioctl+0x9f7/0x33e0 [btrfs] [539604.324331] ? rcu_read_lock_sched_held+0x12/0x70 [539604.325137] ? lock_release+0x224/0x4a0 [539604.325808] ? __x64_sys_ioctl+0x87/0xc0 [539604.326467] __x64_sys_ioctl+0x87/0xc0 [539604.327109] do_syscall_64+0x38/0x90 [539604.327875] entry_SYSCALL_64_after_hwframe+0x72/0xdc [539604.328792] RIP: 0033:0x7f05a7babaeb This needs to use regular btrfs_search_slot() with some skip and stop logic. Since we only consider five samples (five search slots), don't bother with the complexity of looking for commit_root_sem contention. If necessary, it can be added to the load function in between samples. Reported-by: Filipe Manana Link: https://lore.kernel.org/linux-btrfs/CAL3q7H7eKMD44Z1+=Kb-1RFMMeZpAm2fwyO59yeBwCcSOU80Pg@mail.gmail.com/ Fixes: c7eec3d9aa95 ("btrfs: load block group size class when caching") Signed-off-by: Boris Burkov Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 42 ++++++++++++++++-------------------------- 1 file changed, 16 insertions(+), 26 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 5b10401d803b..05102a55710c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -558,14 +558,15 @@ u64 add_new_free_space(struct btrfs_block_group *block_group, u64 start, u64 end static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group, int index, int max_index, - struct btrfs_key *key) + struct btrfs_key *found_key) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *extent_root; - int ret = 0; u64 search_offset; u64 search_end = block_group->start + block_group->length; struct btrfs_path *path; + struct btrfs_key search_key; + int ret = 0; ASSERT(index >= 0); ASSERT(index <= max_index); @@ -585,37 +586,24 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ path->reada = READA_FORWARD; search_offset = index * div_u64(block_group->length, max_index); - key->objectid = block_group->start + search_offset; - key->type = BTRFS_EXTENT_ITEM_KEY; - key->offset = 0; + search_key.objectid = block_group->start + search_offset; + search_key.type = BTRFS_EXTENT_ITEM_KEY; + search_key.offset = 0; - while (1) { - ret = btrfs_search_forward(extent_root, key, path, 0); - if (ret != 0) - goto out; + btrfs_for_each_slot(extent_root, &search_key, found_key, path, ret) { /* Success; sampled an extent item in the block group */ - if (key->type == BTRFS_EXTENT_ITEM_KEY && - key->objectid >= block_group->start && - key->objectid + key->offset <= search_end) - goto out; + if (found_key->type == BTRFS_EXTENT_ITEM_KEY && + found_key->objectid >= block_group->start && + found_key->objectid + found_key->offset <= search_end) + break; /* We can't possibly find a valid extent item anymore */ - if (key->objectid >= search_end) { + if (found_key->objectid >= search_end) { ret = 1; break; } - if (key->type < BTRFS_EXTENT_ITEM_KEY) - key->type = BTRFS_EXTENT_ITEM_KEY; - else - key->objectid++; - btrfs_release_path(path); - up_read(&fs_info->commit_root_sem); - mutex_unlock(&caching_ctl->mutex); - cond_resched(); - mutex_lock(&caching_ctl->mutex); - down_read(&fs_info->commit_root_sem); } -out: + lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -659,6 +647,7 @@ out: static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl, struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_key key; int i; u64 min_size = block_group->length; @@ -668,6 +657,8 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl if (!btrfs_block_group_should_use_size_class(block_group)) return 0; + lockdep_assert_held(&caching_ctl->mutex); + lockdep_assert_held_read(&fs_info->commit_root_sem); for (i = 0; i < 5; ++i) { ret = sample_block_group_extent_item(caching_ctl, block_group, i, 5, &key); if (ret < 0) @@ -682,7 +673,6 @@ static int load_block_group_size_class(struct btrfs_caching_control *caching_ctl block_group->size_class = size_class; spin_unlock(&block_group->lock); } - out: return ret; } From 2943868a909f1d526da363dc077fd7b578643f4b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 11 Feb 2023 19:53:05 +0800 Subject: [PATCH 3/8] btrfs: ioctl: return device fsid from DEV_INFO ioctl Currently user space utilizes dev info ioctl to grab the info of a certain devid, this includes its device uuid. But the returned info is not enough to determine if a device is a seed. Commit a26d60dedf9a ("btrfs: sysfs: add devinfo/fsid to retrieve actual fsid from the device") exports the same value in sysfs so this is for parity with ioctl. Add a new member, fsid, into btrfs_ioctl_dev_info_args, and populate the member with fsid value. This should not cause any compatibility problem, following the combinations: - Old user space, old kernel - Old user space, new kernel User space tool won't even check the new member. - New user space, old kernel The kernel won't touch the new member, and user space tool should zero out its argument, thus the new member is all zero. User space tool can then know the kernel doesn't support this fsid reporting, and falls back to whatever they can. - New user space, new kernel Go as planned. Would find the fsid member is no longer zero, and trust its value. Reviewed-by: Anand Jain Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 + include/uapi/linux/btrfs.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8ea557e22252..439a5bf5ebc6 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2859,6 +2859,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, di_args->bytes_used = btrfs_device_get_bytes_used(dev); di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); + memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); if (dev->name) strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path)); else diff --git a/include/uapi/linux/btrfs.h b/include/uapi/linux/btrfs.h index b4f0f9531119..ada0a489bf2b 100644 --- a/include/uapi/linux/btrfs.h +++ b/include/uapi/linux/btrfs.h @@ -245,7 +245,17 @@ struct btrfs_ioctl_dev_info_args { __u8 uuid[BTRFS_UUID_SIZE]; /* in/out */ __u64 bytes_used; /* out */ __u64 total_bytes; /* out */ - __u64 unused[379]; /* pad to 4k */ + /* + * Optional, out. + * + * Showing the fsid of the device, allowing user space to check if this + * device is a seeding one. + * + * Introduced in v6.3, thus user space still needs to check if kernel + * changed this value. Older kernel will not touch the values here. + */ + __u8 fsid[BTRFS_UUID_SIZE]; + __u64 unused[377]; /* pad to 4k */ __u8 path[BTRFS_DEVICE_PATH_NAME_MAX]; /* out */ }; From c06016a02a6e316d861f7dddd4b70419a47ded2f Mon Sep 17 00:00:00 2001 From: void0red Date: Sat, 18 Feb 2023 12:36:48 +0800 Subject: [PATCH 4/8] btrfs: handle btrfs_del_item errors in __btrfs_update_delayed_inode Even if the slot is already read out, we may still need to re-balance the tree, thus it can cause error in that btrfs_del_item() call and we need to handle it properly. Reviewed-by: Qu Wenruo Signed-off-by: void0red Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0095c6e4c3d1..6b457b010cbc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1048,7 +1048,7 @@ again: * so there is only one iref. The case that several irefs are * in the same item doesn't exist. */ - btrfs_del_item(trans, root, path); + ret = btrfs_del_item(trans, root, path); out: btrfs_release_delayed_iref(node); btrfs_release_path(path); From 98e8d36a26c2ed22f78316df7d4bf33e554b9f9f Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Mon, 13 Feb 2023 14:10:38 +0900 Subject: [PATCH 5/8] btrfs: fix unnecessary increment of read error stat on write error Current btrfs_log_dev_io_error() increases the read error count even if the erroneous IO is a WRITE request. This is because it forget to use "else if", and all the error WRITE requests counts as READ error as there is (of course) no REQ_RAHEAD bit set. Fixes: c3a62baf21ad ("btrfs: use chained bios when cloning") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Christoph Hellwig Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index d8b90f95b157..726592868e9c 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -287,7 +287,7 @@ static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev) if (btrfs_op(bio) == BTRFS_MAP_WRITE) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - if (!(bio->bi_opf & REQ_RAHEAD)) + else if (!(bio->bi_opf & REQ_RAHEAD)) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS); From 95cd356ca23c3807b5f3503687161e216b1c520d Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 21 Feb 2023 10:11:24 -0800 Subject: [PATCH 6/8] btrfs: fix percent calculation for bg reclaim message We have a report, that the info message for block-group reclaim is crossing the 100% used mark. This is happening as we were truncating the divisor for the division (the block_group->length) to a 32bit value. Fix this by using div64_u64() to not truncate the divisor. In the worst case, it can lead to a div by zero error and should be possible to trigger on 4 disks RAID0, and each device is large enough: $ mkfs.btrfs -f /dev/test/scratch[1234] -m raid1 -d raid0 btrfs-progs v6.1 [...] Filesystem size: 40.00GiB Block group profiles: Data: RAID0 4.00GiB <<< Metadata: RAID1 256.00MiB System: RAID1 8.00MiB Reported-by: Forza Link: https://lore.kernel.org/linux-btrfs/e99483.c11a58d.1863591ca52@tnonline.net/ Fixes: 5f93e776c673 ("btrfs: zoned: print unusable percentage when reclaiming block groups") CC: stable@vger.kernel.org # 5.15+ Reviewed-by: Anand Jain Reviewed-by: Qu Wenruo Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba [ add Qu's note ] Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 05102a55710c..0eec3084fb00 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1826,7 +1826,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used %llu%% unusable", - bg->start, div_u64(bg->used * 100, bg->length), + bg->start, + div64_u64(bg->used * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); ret = btrfs_relocate_chunk(fs_info, bg->start); From e4cc1483f35940c9288c332dd275f6fad485f8d2 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 27 Feb 2023 12:53:56 +0000 Subject: [PATCH 7/8] btrfs: fix extent map logging bit not cleared for split maps after dropping range At btrfs_drop_extent_map_range() we are clearing the EXTENT_FLAG_LOGGING bit on a 'flags' variable that was not initialized. This makes static checkers complain about it, so initialize the 'flags' variable before clearing the bit. In practice this has no consequences, because EXTENT_FLAG_LOGGING should not be set when btrfs_drop_extent_map_range() is called, as an fsync locks the inode in exclusive mode, locks the inode's mmap semaphore in exclusive mode too and it always flushes all delalloc. Also add a comment about why we clear EXTENT_FLAG_LOGGING on a copy of the flags of the split extent map. Reported-by: Dan Carpenter Link: https://lore.kernel.org/linux-btrfs/Y%2FyipSVozUDEZKow@kili/ Fixes: db21370bffbc ("btrfs: drop extent map range more efficiently") Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/extent_map.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index be94030e1dfb..138afa955370 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -763,7 +763,13 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto next; } + flags = em->flags; clear_bit(EXTENT_FLAG_PINNED, &em->flags); + /* + * In case we split the extent map, we want to preserve the + * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want + * it on the new extent maps. + */ clear_bit(EXTENT_FLAG_LOGGING, &flags); modified = !list_empty(&em->list); @@ -774,7 +780,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (em->start >= start && em_end <= end) goto remove_em; - flags = em->flags; gen = em->generation; compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); From 675dfe1223a69e270b3d52cb0211c8a501455cec Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 6 Mar 2023 10:13:34 +0000 Subject: [PATCH 8/8] btrfs: fix block group item corruption after inserting new block group We can often end up inserting a block group item, for a new block group, with a wrong value for the used bytes field. This happens if for the new allocated block group, in the same transaction that created the block group, we have tasks allocating extents from it as well as tasks removing extents from it. For example: 1) Task A creates a metadata block group X; 2) Two extents are allocated from block group X, so its "used" field is updated to 32K, and its "commit_used" field remains as 0; 3) Transaction commit starts, by some task B, and it enters btrfs_start_dirty_block_groups(). There it tries to update the block group item for block group X, which currently has its "used" field with a value of 32K. But that fails since the block group item was not yet inserted, and so on failure update_block_group_item() sets the "commit_used" field of the block group back to 0; 4) The block group item is inserted by task A, when for example btrfs_create_pending_block_groups() is called when releasing its transaction handle. This results in insert_block_group_item() inserting the block group item in the extent tree (or block group tree), with a "used" field having a value of 32K, but without updating the "commit_used" field in the block group, which remains with value of 0; 5) The two extents are freed from block X, so its "used" field changes from 32K to 0; 6) The transaction commit by task B continues, it enters btrfs_write_dirty_block_groups() which calls update_block_group_item() for block group X, and there it decides to skip the block group item update, because "used" has a value of 0 and "commit_used" has a value of 0 too. As a result, we end up with a block item having a 32K "used" field but no extents allocated from it. When this issue happens, a btrfs check reports an error like this: [1/7] checking root items [2/7] checking extents block group [1104150528 1073741824] used 39796736 but extent items used 0 ERROR: errors found in extent allocation tree or chunk allocation (...) Fix this by making insert_block_group_item() update the block group's "commit_used" field. Fixes: 7248e0cebbef ("btrfs: skip update of block group item if used bytes are the same") CC: stable@vger.kernel.org # 6.2+ Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0eec3084fb00..0ef8b8926bfa 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2484,18 +2484,29 @@ static int insert_block_group_item(struct btrfs_trans_handle *trans, struct btrfs_block_group_item bgi; struct btrfs_root *root = btrfs_block_group_root(fs_info); struct btrfs_key key; + u64 old_commit_used; + int ret; spin_lock(&block_group->lock); btrfs_set_stack_block_group_used(&bgi, block_group->used); btrfs_set_stack_block_group_chunk_objectid(&bgi, block_group->global_root_id); btrfs_set_stack_block_group_flags(&bgi, block_group->flags); + old_commit_used = block_group->commit_used; + block_group->commit_used = block_group->used; key.objectid = block_group->start; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; key.offset = block_group->length; spin_unlock(&block_group->lock); - return btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + ret = btrfs_insert_item(trans, root, &key, &bgi, sizeof(bgi)); + if (ret < 0) { + spin_lock(&block_group->lock); + block_group->commit_used = old_commit_used; + spin_unlock(&block_group->lock); + } + + return ret; } static int insert_dev_extent(struct btrfs_trans_handle *trans,