Diffstat (limited to 'fs/btrfs')
70 files changed, 7862 insertions, 6541 deletions
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 0a0d0eccee4e..548d6a5477b4 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
 return acl;
 }
-static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
- struct user_namespace *mnt_userns,
- struct inode *inode, struct posix_acl *acl, int type)
+int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
+ struct posix_acl *acl, int type)
 {
 int ret, size = 0;
 const char *name;
@@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
 if (ret)
 return ret;
 }
- ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
+ ret = __btrfs_set_acl(NULL, inode, acl, type);
 if (ret)
 inode->i_mode = old_mode;
 return ret;
 }
-
-int btrfs_init_acl(struct btrfs_trans_handle *trans,
- struct inode *inode, struct inode *dir)
-{
- struct posix_acl *default_acl, *acl;
- int ret = 0;
-
- /* this happens with subvols */
- if (!dir)
- return 0;
-
- ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
- if (ret)
- return ret;
-
- if (default_acl) {
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
- ACL_TYPE_DEFAULT);
- posix_acl_release(default_acl);
- }
-
- if (acl) {
- if (!ret)
- ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
- ACL_TYPE_ACCESS);
- posix_acl_release(acl);
- }
-
- if (!default_acl && !acl)
- cache_no_acl(inode);
- return ret;
-}
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 43c89952b7d2..aac240430efe 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -15,13 +15,12 @@
 enum {
 WORK_DONE_BIT,
 WORK_ORDER_DONE_BIT,
- WORK_HIGH_PRIO_BIT,
 };
 #define NO_THRESHOLD (-1)
 #define DFT_THRESHOLD (32)
-struct __btrfs_workqueue {
+struct btrfs_workqueue {
 struct workqueue_struct *normal_wq;
 /* File system this workqueue services */
@@ -48,12 +47,7 @@ struct __btrfs_workqueue {
 spinlock_t thres_lock;
 };
-struct btrfs_workqueue {
- struct __btrfs_workqueue *normal;
- struct __btrfs_workqueue *high;
-};
-
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
 {
 return wq->fs_info;
 }
@@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
 bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
 {
 /*
- * We could compare wq->normal->pending with num_online_cpus()
+ * We could compare wq->pending with num_online_cpus()
 * to support "thresh == NO_THRESHOLD" case, but it requires
 * moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
 * postpone it until someone needs the support of that case.
 */
- if (wq->normal->thresh == NO_THRESHOLD)
+ if (wq->thresh == NO_THRESHOLD)
 return false;
- return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
+ return atomic_read(&wq->pending) > wq->thresh * 2;
 }
-static struct __btrfs_workqueue *
-__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
- unsigned int flags, int limit_active, int thresh)
+struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
+ const char *name, unsigned int flags,
+ int limit_active, int thresh)
 {
- struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
+ struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 if (!ret)
 return NULL;
@@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
 ret->thresh = thresh;
 }
- if (flags & WQ_HIGHPRI)
- ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
- ret->current_active, name);
- else
- ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
- ret->current_active, name);
+ ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
+ name);
 if (!ret->normal_wq) {
 kfree(ret);
 return NULL;
@@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
 INIT_LIST_HEAD(&ret->ordered_list);
 spin_lock_init(&ret->list_lock);
 spin_lock_init(&ret->thres_lock);
- trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
- return ret;
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
-
-struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
- const char *name,
- unsigned int flags,
- int limit_active,
- int thresh)
-{
- struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
-
- if (!ret)
- return NULL;
-
- ret->normal = __btrfs_alloc_workqueue(fs_info, name,
- flags & ~WQ_HIGHPRI,
- limit_active, thresh);
- if (!ret->normal) {
- kfree(ret);
- return NULL;
- }
-
- if (flags & WQ_HIGHPRI) {
- ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
- limit_active, thresh);
- if (!ret->high) {
- __btrfs_destroy_workqueue(ret->normal);
- kfree(ret);
- return NULL;
- }
- }
+ trace_btrfs_workqueue_alloc(ret, name);
 return ret;
 }
@@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
 * This hook WILL be called in IRQ handler context,
 * so workqueue_set_max_active MUST NOT be called in this hook
 */
-static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
 {
 if (wq->thresh == NO_THRESHOLD)
 return;
@@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
 * This hook is called in kthread content.
 * So workqueue_set_max_active is called here.
 */
-static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
+static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
 {
 int new_current_active;
 long pending;
@@ -217,7 +173,7 @@ out:
 }
 }
-static void run_ordered_work(struct __btrfs_workqueue *wq,
+static void run_ordered_work(struct btrfs_workqueue *wq,
 struct btrfs_work *self)
 {
 struct list_head *list = &wq->ordered_list;
@@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 {
 struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
 normal_work);
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq = work->wq;
 int need_order = 0;
 /*
@@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
 */
 if (work->ordered_func)
 need_order = 1;
- wq = work->wq;
 trace_btrfs_work_sched(work);
 thresh_exec_hook(wq);
@@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
 work->flags = 0;
 }
-static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
- struct btrfs_work *work)
+void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
 {
 unsigned long flags;
@@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
 queue_work(wq->normal_wq, &work->normal_work);
 }
-void btrfs_queue_work(struct btrfs_workqueue *wq,
- struct btrfs_work *work)
-{
- struct __btrfs_workqueue *dest_wq;
-
- if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
- dest_wq = wq->high;
- else
- dest_wq = wq->normal;
- __btrfs_queue_work(dest_wq, work);
-}
-
-static inline void
-__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
-{
- destroy_workqueue(wq->normal_wq);
- trace_btrfs_workqueue_destroy(wq);
- kfree(wq);
-}
-
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
 {
 if (!wq)
 return;
- if (wq->high)
- __btrfs_destroy_workqueue(wq->high);
- __btrfs_destroy_workqueue(wq->normal);
+ destroy_workqueue(wq->normal_wq);
+ trace_btrfs_workqueue_destroy(wq);
 kfree(wq);
 }
 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
 {
- if (!wq)
- return;
- wq->normal->limit_active = limit_active;
- if (wq->high)
- wq->high->limit_active = limit_active;
-}
-
-void btrfs_set_work_high_priority(struct btrfs_work *work)
-{
- set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
+ if (wq)
+ wq->limit_active = limit_active;
 }
 void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
 {
- if (wq->high)
- flush_workqueue(wq->high->normal_wq);
-
- flush_workqueue(wq->normal->normal_wq);
+ flush_workqueue(wq->normal_wq);
 }
diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h
index 3204daa51b95..6e2596ddae10 100644
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -11,11 +11,8 @@
 struct btrfs_fs_info;
 struct btrfs_workqueue;
-/* Internal use only */
-struct __btrfs_workqueue;
 struct btrfs_work;
 typedef void (*btrfs_func_t)(struct btrfs_work *arg);
-typedef void (*btrfs_work_func_t)(struct work_struct *arg);
 struct btrfs_work {
 btrfs_func_t func;
@@ -25,7 +22,7 @@ struct btrfs_work {
 /* Don't touch things below */
 struct work_struct normal_work;
 struct list_head ordered_list;
- struct __btrfs_workqueue *wq;
+ struct btrfs_workqueue *wq;
 unsigned long flags;
 };
@@ -40,9 +37,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
 struct btrfs_work *work);
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
-void btrfs_set_work_high_priority(struct btrfs_work *work);
 struct btrfs_fs_info * __pure btrfs_work_owner(const
struct btrfs_work *work);
-struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
+struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
 bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
 void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index ebc392ea1d74..d385357e19b6 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -2028,10 +2028,29 @@ out:
 return ret;
 }
+static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
+{
+ struct btrfs_data_container *inodes = ctx;
+ const size_t c = 3 * sizeof(u64);
+
+ if (inodes->bytes_left >= c) {
+ inodes->bytes_left -= c;
+ inodes->val[inodes->elem_cnt] = inum;
+ inodes->val[inodes->elem_cnt + 1] = offset;
+ inodes->val[inodes->elem_cnt + 2] = root;
+ inodes->elem_cnt += 3;
+ } else {
+ inodes->bytes_missing += c - inodes->bytes_left;
+ inodes->bytes_left = 0;
+ inodes->elem_missed += 3;
+ }
+
+ return 0;
+}
+
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
- bool ignore_offset)
+ void *ctx, bool ignore_offset)
 {
 int ret;
 u64 extent_item_pos;
@@ -2049,17 +2068,15 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 extent_item_pos = logical - found_key.objectid;
 ret = iterate_extent_inodes(fs_info, found_key.objectid,
 extent_item_pos, search_commit_root,
- iterate, ctx, ignore_offset);
+ build_ino_list, ctx, ignore_offset);
 return ret;
 }
-typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx);
+static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb, struct inode_fs_paths *ipath);
-static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_refs(u64 inum, struct inode_fs_paths *ipath)
 {
 int ret = 0;
 int slot;
@@ -2068,6 +2085,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 u32 name_len;
 u64 parent = 0;
 int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
 struct extent_buffer *eb;
 struct btrfs_inode_ref *iref;
 struct btrfs_key found_key;
@@ -2103,8 +2122,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 "following ref at offset %u for inode %llu in tree %llu",
 cur, found_key.objectid,
 fs_root->root_key.objectid);
- ret = iterate(parent, name_len,
- (unsigned long)(iref + 1), eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)(iref + 1), eb, ipath);
 if (ret)
 break;
 len = sizeof(*iref) + name_len;
@@ -2118,15 +2137,15 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 return ret;
 }
-static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path,
- iterate_irefs_t *iterate, void *ctx)
+static int iterate_inode_extrefs(u64 inum, struct inode_fs_paths *ipath)
 {
 int ret;
 int slot;
 u64 offset = 0;
 u64 parent;
 int found = 0;
+ struct btrfs_root *fs_root = ipath->fs_root;
+ struct btrfs_path *path = ipath->btrfs_path;
 struct extent_buffer *eb;
 struct btrfs_inode_extref *extref;
 u32 item_size;
@@ -2162,8 +2181,8 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
 parent = btrfs_inode_extref_parent(eb, extref);
 name_len =
btrfs_inode_extref_name_len(eb, extref);
- ret = iterate(parent, name_len,
- (unsigned long)&extref->name, eb, ctx);
+ ret = inode_to_path(parent, name_len,
+ (unsigned long)&extref->name, eb, ipath);
 if (ret)
 break;
@@ -2180,34 +2199,13 @@ static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 return ret;
 }
-static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
- struct btrfs_path *path, iterate_irefs_t *iterate,
- void *ctx)
-{
- int ret;
- int found_refs = 0;
-
- ret = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
- if (!ret)
- ++found_refs;
- else if (ret != -ENOENT)
- return ret;
-
- ret = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
- if (ret == -ENOENT && found_refs)
- return 0;
-
- return ret;
-}
-
 /*
 * returns 0 if the path could be dumped (probably truncated)
 * returns <0 in case of an error
 */
 static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
- struct extent_buffer *eb, void *ctx)
+ struct extent_buffer *eb, struct inode_fs_paths *ipath)
 {
- struct inode_fs_paths *ipath = ctx;
 char *fspath;
 char *fspath_min;
 int i = ipath->fspath->elem_cnt;
@@ -2248,8 +2246,20 @@ static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
 */
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
- return iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
- inode_to_path, ipath);
+ int ret;
+ int found_refs = 0;
+
+ ret = iterate_inode_refs(inum, ipath);
+ if (!ret)
+ ++found_refs;
+ else if (ret != -ENOENT)
+ return ret;
+
+ ret = iterate_inode_extrefs(inum, ipath);
+ if (ret == -ENOENT && found_refs)
+ return 0;
+
+ return ret;
 }
 struct btrfs_data_container *init_data_container(u32 total_bytes)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index ba454032dbe2..2759de7d324c 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -35,8 +35,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 bool ignore_offset);
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
- struct btrfs_path *path,
- iterate_extent_inodes_t *iterate, void *ctx,
+ struct btrfs_path *path, void *ctx,
 bool ignore_offset);
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 0dd6de994199..c3aecfb0a71d 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 struct rb_node **p;
 struct rb_node *parent = NULL;
 struct btrfs_block_group *cache;
+ bool leftmost = true;
 ASSERT(block_group->length != 0);
- spin_lock(&info->block_group_cache_lock);
- p = &info->block_group_cache_tree.rb_node;
+ write_lock(&info->block_group_cache_lock);
+ p = &info->block_group_cache_tree.rb_root.rb_node;
 while (*p) {
 parent = *p;
@@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
 p = &(*p)->rb_left;
 } else if (block_group->start > cache->start) {
 p = &(*p)->rb_right;
+ leftmost = false;
 } else {
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
 return -EEXIST;
 }
 }
 rb_link_node(&block_group->cache_node, parent, p);
- rb_insert_color(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_insert_color_cached(&block_group->cache_node,
+ &info->block_group_cache_tree, leftmost);
- if (info->first_logical_byte > block_group->start)
- info->first_logical_byte = block_group->start;
-
- spin_unlock(&info->block_group_cache_lock);
+
write_unlock(&info->block_group_cache_lock);
 return 0;
 }
@@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
 struct rb_node *n;
 u64 end, start;
- spin_lock(&info->block_group_cache_lock);
- n = info->block_group_cache_tree.rb_node;
+ read_lock(&info->block_group_cache_lock);
+ n = info->block_group_cache_tree.rb_root.rb_node;
 while (n) {
 cache = rb_entry(n, struct btrfs_block_group, cache_node);
@@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
 break;
 }
 }
- if (ret) {
+ if (ret)
 btrfs_get_block_group(ret);
- if (bytenr == 0 && info->first_logical_byte > ret->start)
- info->first_logical_byte = ret->start;
- }
- spin_unlock(&info->block_group_cache_lock);
+ read_unlock(&info->block_group_cache_lock);
 return ret;
 }
@@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
 struct btrfs_fs_info *fs_info = cache->fs_info;
 struct rb_node *node;
- spin_lock(&fs_info->block_group_cache_lock);
+ read_lock(&fs_info->block_group_cache_lock);
 /* If our block group was removed, we need a full search. */
 if (RB_EMPTY_NODE(&cache->cache_node)) {
 const u64 next_bytenr = cache->start + cache->length;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
 btrfs_put_block_group(cache);
- cache = btrfs_lookup_first_block_group(fs_info, next_bytenr); return cache;
+ return btrfs_lookup_first_block_group(fs_info, next_bytenr);
 }
 node = rb_next(&cache->cache_node);
 btrfs_put_block_group(cache);
@@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
 btrfs_get_block_group(cache);
 } else
 cache = NULL;
- spin_unlock(&fs_info->block_group_cache_lock);
+ read_unlock(&fs_info->block_group_cache_lock);
 return cache;
 }
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Check if we can do a NOCOW write for a given extent.
+ *
+ * @fs_info: The filesystem information object.
+ * @bytenr: Logical start address of the extent.
+ *
+ * Check if we can do a NOCOW write for the given extent, and increments the
+ * number of NOCOW writers in the block group that contains the extent, as long
+ * as the block group exists and it's currently not in read-only mode.
+ *
+ * Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
+ * is responsible for calling btrfs_dec_nocow_writers() later.
+ *
+ * Or NULL if we can not do a NOCOW write
+ */
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr)
 {
 struct btrfs_block_group *bg;
- bool ret = true;
+ bool can_nocow = true;
 bg = btrfs_lookup_block_group(fs_info, bytenr);
 if (!bg)
- return false;
+ return NULL;
 spin_lock(&bg->lock);
 if (bg->ro)
- ret = false;
+ can_nocow = false;
 else
 atomic_inc(&bg->nocow_writers);
 spin_unlock(&bg->lock);
- /* No put on block group, done by btrfs_dec_nocow_writers */
- if (!ret)
+ if (!can_nocow) {
 btrfs_put_block_group(bg);
+ return NULL;
+ }
- return ret;
+ /* No put on block group, done by btrfs_dec_nocow_writers(). */
+ return bg;
 }
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
+/**
+ * Decrement the number of NOCOW writers in a block group.
+ *
+ * @bg: The block group.
+ *
+ * This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
+ * and on the block group returned by that call. Typically this is called after
+ * creating an ordered extent for a NOCOW write, to prevent races with scrub and
+ * relocation.
+ *
+ * After this call, the caller should not use the block group anymore. It it wants
+ * to use it, then it should get a reference on it before calling this function.
+ */
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
 {
- struct btrfs_block_group *bg;
-
- bg = btrfs_lookup_block_group(fs_info, bytenr);
- ASSERT(bg);
 if (atomic_dec_and_test(&bg->nocow_writers))
 wake_up_var(&bg->nocow_writers);
- /*
- * Once for our lookup and once for the lookup done by a previous call
- * to btrfs_inc_nocow_writers()
- */
- btrfs_put_block_group(bg);
+
+ /* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
 btrfs_put_block_group(bg);
 }
@@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
 cache->has_caching_ctl = 1;
 spin_unlock(&cache->lock);
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
 refcount_inc(&caching_ctl->count);
 list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
 btrfs_get_block_group(cache);
@@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 if (ret)
 goto out;
- spin_lock(&fs_info->block_group_cache_lock);
- rb_erase(&block_group->cache_node,
- &fs_info->block_group_cache_tree);
+ write_lock(&fs_info->block_group_cache_lock);
+ rb_erase_cached(&block_group->cache_node,
+ &fs_info->block_group_cache_tree);
 RB_CLEAR_NODE(&block_group->cache_node);
 /* Once for the block groups rbtree */
 btrfs_put_block_group(block_group);
- if (fs_info->first_logical_byte == block_group->start)
- fs_info->first_logical_byte = (u64)-1;
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
 down_write(&block_group->space_info->groups_sem);
 /*
@@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 if (block_group->cached == BTRFS_CACHE_STARTED)
 btrfs_wait_block_group_cache_done(block_group);
 if (block_group->has_caching_ctl) {
- spin_lock(&fs_info->block_group_cache_lock);
+ write_lock(&fs_info->block_group_cache_lock);
 if (!caching_ctl) {
 struct btrfs_caching_control *ctl;
@@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 }
 if (caching_ctl)
 list_del_init(&caching_ctl->list);
- spin_unlock(&fs_info->block_group_cache_lock);
+ write_unlock(&fs_info->block_group_cache_lock);
 if (caching_ctl) {
 /* Once for the caching bgs list and once for us.
 */
 btrfs_put_caching_control(caching_ctl);
@@ -1033,8 +1051,13 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 < block_group->zone_unusable);
 WARN_ON(block_group->space_info->disk_total
 < block_group->length * factor);
+ WARN_ON(block_group->zone_is_active &&
+ block_group->space_info->active_total_bytes
+ < block_group->length);
 }
 block_group->space_info->total_bytes -= block_group->length;
+ if (block_group->zone_is_active)
+ block_group->space_info->active_total_bytes -= block_group->length;
 block_group->space_info->bytes_readonly -=
 (block_group->length - block_group->zone_unusable);
 block_group->space_info->bytes_zone_unusable -=
@@ -1367,6 +1390,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
 goto next;
 }
+ ret = btrfs_zone_finish(block_group);
+ if (ret < 0) {
+ btrfs_dec_block_group_ro(block_group);
+ if (ret == -EAGAIN)
+ ret = 0;
+ goto next;
+ }
+
 /*
 * Want to do this before we do anything else so we can recover
 * properly if we fail to join the transaction.
@@ -1512,6 +1543,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
 return bg1->used > bg2->used;
 }
+static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
+{
+ if (btrfs_is_zoned(fs_info))
+ return btrfs_zoned_should_reclaim(fs_info);
+ return true;
+}
+
 void btrfs_reclaim_bgs_work(struct work_struct *work)
 {
 struct btrfs_fs_info *fs_info =
@@ -1522,6 +1560,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
 if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
 return;
+ if (!btrfs_should_reclaim(fs_info))
+ return;
+
 sb_start_write(fs_info->sb);
 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@@ -1692,35 +1733,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
 struct btrfs_root *root = btrfs_block_group_root(fs_info);
 int ret;
 struct btrfs_key found_key;
- struct extent_buffer *leaf;
- int slot;
-
- ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
- if (ret < 0)
- return ret;
-
- while (1) {
- slot = path->slots[0];
- leaf = path->nodes[0];
- if (slot >= btrfs_header_nritems(leaf)) {
- ret = btrfs_next_leaf(root, path);
- if (ret == 0)
- continue;
- if (ret < 0)
- goto out;
- break;
- }
- btrfs_item_key_to_cpu(leaf, &found_key, slot);
+ btrfs_for_each_slot(root, key, &found_key, path, ret) {
 if (found_key.objectid >= key->objectid &&
 found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
- ret = read_bg_from_eb(fs_info, &found_key, path);
- break;
+ return read_bg_from_eb(fs_info, &found_key, path);
 }
-
- path->slots[0]++;
 }
-out:
 return ret;
 }
@@ -1802,11 +1821,10 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 stripe_nr = physical - map->stripes[i].physical;
 stripe_nr = div64_u64_rem(stripe_nr, map->stripe_len, &offset);
- if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
+ if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
+ BTRFS_BLOCK_GROUP_RAID10)) {
 stripe_nr = stripe_nr * map->num_stripes + i;
 stripe_nr = div_u64(stripe_nr, map->sub_stripes);
- } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
- stripe_nr = stripe_nr * map->num_stripes + i;
 }
 /*
 * The remaining case would be for RAID56, multiply by
@@ -2094,7 +2112,8 @@ static int read_one_block_group(struct btrfs_fs_info *info,
 trace_btrfs_add_block_group(info, cache, 0);
 btrfs_update_space_info(info, cache->flags, cache->length,
 cache->used, cache->bytes_super,
- cache->zone_unusable, &space_info);
+ cache->zone_unusable, cache->zone_is_active,
+ &space_info);
 cache->space_info = space_info;
@@ -2164,7 +2183,7 @@ static int fill_dummy_bgs(struct
btrfs_fs_info *fs_info)
 }
 btrfs_update_space_info(fs_info, bg->flags, em->len, em->len,
- 0, 0, &space_info);
+ 0, 0, false, &space_info);
 bg->space_info = space_info;
 link_block_group(bg);
@@ -2545,7 +2564,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
 trace_btrfs_add_block_group(fs_info, cache, 1);
 btrfs_update_space_info(fs_info, cache->flags, size, bytes_used,
 cache->bytes_super, cache->zone_unusable,
- &cache->space_info);
+ cache->zone_is_active, &cache->space_info);
 btrfs_update_global_block_rsv(fs_info);
 link_block_group(cache);
@@ -2645,6 +2664,14 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE);
 if (ret < 0)
 goto out;
+ /*
+ * We have allocated a new chunk. We also need to activate that chunk to
+ * grant metadata tickets for zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true);
+ if (ret < 0)
+ goto out;
+
 ret = inc_block_group_ro(cache, 0);
 if (ret == -ETXTBSY)
 goto unlock_out;
@@ -3220,6 +3247,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
 return ret;
 }
+static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
+ u64 bytes_freed)
+{
+ const struct btrfs_space_info *space_info = bg->space_info;
+ const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
+ const u64 new_val = bg->used;
+ const u64 old_val = new_val + bytes_freed;
+ u64 thresh;
+
+ if (reclaim_thresh == 0)
+ return false;
+
+ thresh = div_factor_fine(bg->length, reclaim_thresh);
+
+ /*
+ * If we were below the threshold before don't reclaim, we are likely a
+ * brand new block group and we don't want to relocate new block groups.
+ */
+ if (old_val < thresh)
+ return false;
+ if (new_val >= thresh)
+ return false;
+ return true;
+}
+
 int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 u64 bytenr, u64 num_bytes, bool alloc)
 {
@@ -3242,6 +3294,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 spin_unlock(&info->delalloc_root_lock);
 while (total) {
+ bool reclaim;
+
 cache = btrfs_lookup_block_group(info, bytenr);
 if (!cache) {
 ret = -ENOENT;
@@ -3287,6 +3341,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 cache->space_info, num_bytes);
 cache->space_info->bytes_used -= num_bytes;
 cache->space_info->disk_used -= num_bytes * factor;
+
+ reclaim = should_reclaim_block_group(cache, num_bytes);
 spin_unlock(&cache->lock);
 spin_unlock(&cache->space_info->lock);
@@ -3313,6 +3369,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
 if (!alloc && old_val == 0) {
 if (!btrfs_test_opt(info, DISCARD_ASYNC))
 btrfs_mark_bg_unused(cache);
+ } else if (!alloc && reclaim) {
+ btrfs_mark_bg_to_reclaim(cache);
 }
 btrfs_put_block_group(cache);
@@ -3716,6 +3774,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags,
 * attempt.
 */
 wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
 spin_unlock(&space_info->lock);
 mutex_lock(&fs_info->chunk_mutex);
 mutex_unlock(&fs_info->chunk_mutex);
@@ -3839,6 +3898,14 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans,
 ret = PTR_ERR(bg);
 } else {
 /*
+ * We have a new chunk. We also need to activate it for
+ * zoned filesystem.
+ */
+ ret = btrfs_zoned_activate_one_bg(fs_info, info, true);
+ if (ret < 0)
+ return;
+
+ /*
 * If we fail to add the chunk item here, we end up
 * trying again at phase 2 of chunk allocation, at
 * btrfs_create_pending_block_groups().
So ignore
@@ -3957,14 +4024,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 struct btrfs_caching_control *caching_ctl;
 struct rb_node *n;
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
 while (!list_empty(&info->caching_block_groups)) {
 caching_ctl = list_entry(info->caching_block_groups.next,
 struct btrfs_caching_control, list);
 list_del(&caching_ctl->list);
 btrfs_put_caching_control(caching_ctl);
 }
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
 spin_lock(&info->unused_bgs_lock);
 while (!list_empty(&info->unused_bgs)) {
@@ -3994,14 +4061,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 }
 spin_unlock(&info->zone_active_bgs_lock);
- spin_lock(&info->block_group_cache_lock);
- while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
+ write_lock(&info->block_group_cache_lock);
+ while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
 block_group = rb_entry(n, struct btrfs_block_group,
 cache_node);
- rb_erase(&block_group->cache_node,
- &info->block_group_cache_tree);
+ rb_erase_cached(&block_group->cache_node,
+ &info->block_group_cache_tree);
 RB_CLEAR_NODE(&block_group->cache_node);
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
 down_write(&block_group->space_info->groups_sem);
 list_del(&block_group->list);
@@ -4024,9 +4091,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 ASSERT(block_group->swap_extents == 0);
 btrfs_put_block_group(block_group);
- spin_lock(&info->block_group_cache_lock);
+ write_lock(&info->block_group_cache_lock);
 }
- spin_unlock(&info->block_group_cache_lock);
+ write_unlock(&info->block_group_cache_lock);
 btrfs_release_global_block_rsv(info);
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index e8308f2ad07d..35e0e860cc0b 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -104,6 +104,7 @@ struct btrfs_block_group {
 unsigned int relocating_repair:1;
 unsigned int chunk_item_inserted:1;
 unsigned int zone_is_active:1;
+ unsigned int zoned_data_reloc_ongoing:1;
 int disk_cache_state;
@@ -212,6 +213,8 @@ struct btrfs_block_group {
 u64 meta_write_pointer;
 struct map_lookup *physical_map;
 struct list_head active_bg_list;
+ struct work_struct zone_finish_work;
+ struct extent_buffer *last_eb;
 };
 static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@@ -254,8 +257,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
 void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
 const u64 start);
 void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
-bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
-void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
+struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
+ u64 bytenr);
+void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
 void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
 void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
 u64 num_bytes);
diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c
index b3ee49b0b1e8..06be0644dd37 100644
--- a/fs/btrfs/block-rsv.c
+++ b/fs/btrfs/block-rsv.c
@@ -118,7 +118,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 if (block_rsv->reserved >= block_rsv->size) {
 num_bytes = block_rsv->reserved - block_rsv->size;
 block_rsv->reserved = block_rsv->size;
- block_rsv->full = 1;
+
block_rsv->full = true;
 } else {
 num_bytes = 0;
 }
@@ -142,7 +142,7 @@ static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
 bytes_to_add = min(num_bytes, bytes_to_add);
 dest->reserved += bytes_to_add;
 if (dest->reserved >= dest->size)
- dest->full = 1;
+ dest->full = true;
 num_bytes -= bytes_to_add;
 }
 spin_unlock(&dest->lock);
@@ -171,7 +171,7 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src,
 return 0;
 }
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type)
 {
 memset(rsv, 0, sizeof(*rsv));
 spin_lock_init(&rsv->lock);
@@ -180,7 +180,7 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 struct btrfs_block_rsv *rsv,
- unsigned short type)
+ enum btrfs_rsv_type type)
 {
 btrfs_init_block_rsv(rsv, type);
 rsv->space_info = btrfs_find_space_info(fs_info,
@@ -188,7 +188,7 @@ void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 }
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type)
+ enum btrfs_rsv_type type)
 {
 struct btrfs_block_rsv *block_rsv;
@@ -304,7 +304,7 @@ int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes)
 if (block_rsv->reserved >= num_bytes) {
 block_rsv->reserved -= num_bytes;
 if (block_rsv->reserved < block_rsv->size)
- block_rsv->full = 0;
+ block_rsv->full = false;
 ret = 0;
 }
 spin_unlock(&block_rsv->lock);
@@ -319,7 +319,7 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
 if (update_size)
 block_rsv->size += num_bytes;
 else if (block_rsv->reserved >= block_rsv->size)
- block_rsv->full = 1;
+ block_rsv->full = true;
 spin_unlock(&block_rsv->lock);
 }
@@ -341,7 +341,7 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
 }
 global_rsv->reserved -= num_bytes;
 if (global_rsv->reserved < global_rsv->size)
- global_rsv->full = 0;
+ global_rsv->full = false;
 spin_unlock(&global_rsv->lock);
 btrfs_block_rsv_add_bytes(dest, num_bytes, true);
@@ -408,10 +408,7 @@ void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
 btrfs_try_granting_tickets(fs_info, sinfo);
 }
- if (block_rsv->reserved == block_rsv->size)
- block_rsv->full = 1;
- else
- block_rsv->full = 0;
+ block_rsv->full = (block_rsv->reserved == block_rsv->size);
 if (block_rsv->size >= sinfo->total_bytes)
 sinfo->force_alloc = CHUNK_ALLOC_FORCE;
diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h
index 3b67ff08d434..0c183709be00 100644
--- a/fs/btrfs/block-rsv.h
+++ b/fs/btrfs/block-rsv.h
@@ -9,7 +9,7 @@ enum btrfs_reserve_flush_enum;
 /*
 * Types of block reserves
 */
-enum {
+enum btrfs_rsv_type {
 BTRFS_BLOCK_RSV_GLOBAL,
 BTRFS_BLOCK_RSV_DELALLOC,
 BTRFS_BLOCK_RSV_TRANS,
@@ -25,9 +25,10 @@ struct btrfs_block_rsv {
 u64 reserved;
 struct btrfs_space_info *space_info;
 spinlock_t lock;
- unsigned short full;
- unsigned short type;
- unsigned short failfast;
+ bool full;
+ bool failfast;
+ /* Block reserve type, one of BTRFS_BLOCK_RSV_* */
+ enum btrfs_rsv_type type:8;
 /*
 * Qgroup equivalent for @size @reserved
@@ -49,13 +50,13 @@ struct btrfs_block_rsv {
 u64 qgroup_rsv_reserved;
 };
-void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type);
+void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, enum btrfs_rsv_type type);
 void btrfs_init_root_block_rsv(struct btrfs_root *root);
 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info *fs_info,
- unsigned short type);
+ enum
btrfs_rsv_type type);
 void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
 struct btrfs_block_rsv *rsv,
- unsigned short type);
+ enum btrfs_rsv_type type);
 void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
 struct btrfs_block_rsv *rsv);
 int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 32131a5d321b..b160b8e124e0 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -279,19 +279,31 @@ static inline void btrfs_insert_inode_hash(struct inode *inode)
 __insert_inode_hash(inode, h);
 }
+#if BITS_PER_LONG == 32
+
+/*
+ * On 32 bit systems the i_ino of struct inode is 32 bits (unsigned long), so
+ * we use the inode's location objectid which is a u64 to avoid truncation.
+ */
 static inline u64 btrfs_ino(const struct btrfs_inode *inode)
 {
 u64 ino = inode->location.objectid;
- /*
- * !ino: btree_inode
- * type == BTRFS_ROOT_ITEM_KEY: subvol dir
- */
- if (!ino || inode->location.type == BTRFS_ROOT_ITEM_KEY)
+ /* type == BTRFS_ROOT_ITEM_KEY: subvol dir */
+ if (inode->location.type == BTRFS_ROOT_ITEM_KEY)
 ino = inode->vfs_inode.i_ino;
 return ino;
 }
+#else
+
+static inline u64 btrfs_ino(const struct btrfs_inode *inode)
+{
+ return inode->vfs_inode.i_ino;
+}
+
+#endif
+
 static inline void btrfs_i_size_write(struct btrfs_inode *inode, u64 size)
 {
 i_size_write(&inode->vfs_inode, size);
@@ -305,8 +317,7 @@ static inline bool btrfs_is_free_space_inode(struct btrfs_inode *inode)
 if (root == root->fs_info->tree_root &&
 btrfs_ino(inode) != BTRFS_BTREE_INODE_OBJECTID)
 return true;
- if (inode->location.objectid == BTRFS_FREE_INO_OBJECTID)
- return true;
+
 return false;
 }
@@ -395,31 +406,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
 return true;
 }
-struct btrfs_dio_private {
- struct inode *inode;
-
- /*
- * Since DIO can use anonymous page, we cannot use page_offset() to
- * grab the file offset, thus need a dedicated member for file offset.
- */
- u64 file_offset;
- u64 disk_bytenr;
- /* Used for bio::bi_size */
- u32 bytes;
-
- /*
- * References to this structure. There is one reference per in-flight
- * bio plus one while we're still setting up.
- */
- refcount_t refs;
-
- /* dio_bio came from fs/direct-io.c */
- struct bio *dio_bio;
-
- /* Array of checksums */
- u8 csums[];
-};
-
 /*
 * btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
 * separate u32s. These two functions convert between the two representations.
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index abac86a75840..98c6e5feab19 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -152,7 +152,7 @@ struct btrfsic_block {
 struct btrfsic_block *next_in_same_bio;
 void *orig_bio_private;
 bio_end_io_t *orig_bio_end_io;
- int submit_bio_bh_rw;
+ blk_opf_t submit_bio_bh_rw;
 u64 flush_gen; /* only valid if !never_written */
 };
@@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 return -ENOMEM;
 block_ctx->datav = block_ctx->mem_to_free;
 block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
- for (i = 0; i < num_pages; i++) {
- block_ctx->pagev[i] = alloc_page(GFP_NOFS);
- if (!block_ctx->pagev[i])
- return -1;
- }
+ ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
+ if (ret)
+ return ret;
 dev_bytenr = block_ctx->dev_bytenr;
 for (i = 0; i < num_pages;) {
 struct bio *bio;
 unsigned int j;
- bio = btrfs_bio_alloc(num_pages - i);
- bio_set_dev(bio, block_ctx->dev->bdev);
+ bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
+ REQ_OP_READ, GFP_NOFS);
 bio->bi_iter.bi_sector = dev_bytenr >> 9;
- bio->bi_opf = REQ_OP_READ;
 for (j = i; j < num_pages; j++) {
 ret = bio_add_page(bio, block_ctx->pagev[j],
@@ -1684,7 +1681,7 @@ static void btrfsic_process_written_block(struct btrfsic_dev_state *dev_state,
 u64 dev_bytenr, char **mapped_datav,
 unsigned int num_pages,
 struct bio *bio, int *bio_is_patched,
- int submit_bio_bh_rw)
+ blk_opf_t submit_bio_bh_rw)
 {
 int is_metadata;
 struct btrfsic_block *block;
@@ -2033,7 +2030,7 @@ continue_loop:
 static void btrfsic_bio_end_io(struct bio *bp)
 {
- struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
+ struct btrfsic_block *block = bp->bi_private;
 int iodone_w_error;
 /* mutex is not held!
This is not save if IO is not yet completed
@@ -2635,100 +2632,93 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
 &btrfsic_dev_state_hashtable);
 }
-static void __btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
 {
- struct btrfsic_dev_state *dev_state;
+ unsigned int segs = bio_segments(bio);
+ u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
+ u64 cur_bytenr = dev_bytenr;
+ struct bvec_iter iter;
+ struct bio_vec bvec;
+ char **mapped_datav;
+ int bio_is_patched = 0;
+ int i = 0;
+
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info(
+"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, segs,
+ bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
- if (!btrfsic_is_initialized)
+ mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
+ if (!mapped_datav)
 return;
- mutex_lock(&btrfsic_mutex);
- /* since btrfsic_submit_bio() is also called before
- * btrfsic_mount(), this might return NULL */
- dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
- if (NULL != dev_state &&
- (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- int i = 0;
- u64 dev_bytenr;
- u64 cur_bytenr;
- struct bio_vec bvec;
- struct bvec_iter iter;
- int bio_is_patched;
- char **mapped_datav;
- unsigned int segs = bio_segments(bio);
-
- dev_bytenr = 512 * bio->bi_iter.bi_sector;
- bio_is_patched = 0;
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
- bio_op(bio), bio->bi_opf, segs,
- bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
-
- mapped_datav = kmalloc_array(segs,
- sizeof(*mapped_datav), GFP_NOFS);
- if (!mapped_datav)
- goto leave;
- cur_bytenr = dev_bytenr;
-
- bio_for_each_segment(bvec, bio, iter) {
- BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = page_address(bvec.bv_page);
- i++;
-
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
- pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
- i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
- cur_bytenr += bvec.bv_len;
- }
- btrfsic_process_written_block(dev_state, dev_bytenr,
- mapped_datav, segs,
- bio, &bio_is_patched,
- bio->bi_opf);
- kfree(mapped_datav);
- } else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
- if (dev_state->state->print_mask &
- BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
- pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
- bio_op(bio), bio->bi_opf, bio->bi_bdev);
- if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
- if ((dev_state->state->print_mask &
- (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
- BTRFSIC_PRINT_MASK_VERBOSE)))
- pr_info(
-"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
- dev_state->bdev);
- } else {
- struct btrfsic_block *const block =
- &dev_state->dummy_block_for_bio_bh_flush;
+ bio_for_each_segment(bvec, bio, iter) {
+ BUG_ON(bvec.bv_len != PAGE_SIZE);
+ mapped_datav[i] = page_address(bvec.bv_page);
+ i++;
- block->is_iodone = 0;
- block->never_written = 0;
- block->iodone_w_error = 0;
- block->flush_gen = dev_state->last_flush_gen + 1;
- block->submit_bio_bh_rw = bio->bi_opf;
- block->orig_bio_private = bio->bi_private;
- block->orig_bio_end_io = bio->bi_end_io;
- block->next_in_same_bio = NULL;
- bio->bi_private = block;
- bio->bi_end_io = btrfsic_bio_end_io;
- }
+ if (dev_state->state->print_mask &
+
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
+ pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
+ i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
+ cur_bytenr += bvec.bv_len;
 }
-leave:
- mutex_unlock(&btrfsic_mutex);
+
+ btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
+ bio, &bio_is_patched, bio->bi_opf);
+ kfree(mapped_datav);
 }
-void btrfsic_submit_bio(struct bio *bio)
+static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
 {
- __btrfsic_submit_bio(bio);
- submit_bio(bio);
+ if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
+ pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
+ bio_op(bio), bio->bi_opf, bio->bi_bdev);
+
+ if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
+ struct btrfsic_block *const block =
+ &dev_state->dummy_block_for_bio_bh_flush;
+
+ block->is_iodone = 0;
+ block->never_written = 0;
+ block->iodone_w_error = 0;
+ block->flush_gen = dev_state->last_flush_gen + 1;
+ block->submit_bio_bh_rw = bio->bi_opf;
+ block->orig_bio_private = bio->bi_private;
+ block->orig_bio_end_io = bio->bi_end_io;
+ block->next_in_same_bio = NULL;
+ bio->bi_private = block;
+ bio->bi_end_io = btrfsic_bio_end_io;
+ } else if ((dev_state->state->print_mask &
+ (BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
+ BTRFSIC_PRINT_MASK_VERBOSE))) {
+ pr_info(
+"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
+ dev_state->bdev);
+ }
 }
-int btrfsic_submit_bio_wait(struct bio *bio)
+void btrfsic_check_bio(struct bio *bio)
 {
- __btrfsic_submit_bio(bio);
- return submit_bio_wait(bio);
+ struct btrfsic_dev_state *dev_state;
+
+ if (!btrfsic_is_initialized)
+ return;
+
+ /*
+ * We can be called before btrfsic_mount, so there might not be a
+ * dev_state.
+ */
+ dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
+ mutex_lock(&btrfsic_mutex);
+ if (dev_state) {
+ if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
+ btrfsic_check_write_bio(bio, dev_state);
+ else if (bio->bi_opf & REQ_PREFLUSH)
+ btrfsic_check_flush_bio(bio, dev_state);
+ }
+ mutex_unlock(&btrfsic_mutex);
 }
 int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/check-integrity.h b/fs/btrfs/check-integrity.h
index bcc730a06cb5..e4c8aed7996f 100644
--- a/fs/btrfs/check-integrity.h
+++ b/fs/btrfs/check-integrity.h
@@ -7,11 +7,9 @@
 #define BTRFS_CHECK_INTEGRITY_H
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
-void btrfsic_submit_bio(struct bio *bio);
-int btrfsic_submit_bio_wait(struct bio *bio);
+void btrfsic_check_bio(struct bio *bio);
 #else
-#define btrfsic_submit_bio submit_bio
-#define btrfsic_submit_bio_wait submit_bio_wait
+static inline void btrfsic_check_bio(struct bio *bio) { }
 #endif
 int btrfsic_mount(struct btrfs_fs_info *fs_info,
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19bf36d8ffea..e84d22c5c6a8 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -136,109 +136,14 @@ static int compression_decompress(int type, struct list_head *ws,
 static int btrfs_decompress_bio(struct compressed_bio *cb);
-static inline int compressed_bio_size(struct btrfs_fs_info *fs_info,
- unsigned long disk_size)
-{
- return sizeof(struct compressed_bio) +
- (DIV_ROUND_UP(disk_size, fs_info->sectorsize)) * fs_info->csum_size;
-}
-
-static int check_compressed_csum(struct btrfs_inode *inode, struct bio *bio,
- u64 disk_start)
-{
- struct btrfs_fs_info *fs_info = inode->root->fs_info;
- SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
- const u32 csum_size = fs_info->csum_size;
- const u32 sectorsize =
fs_info->sectorsize;
- struct page *page;
- unsigned int i;
- char *kaddr;
- u8 csum[BTRFS_CSUM_SIZE];
- struct compressed_bio *cb = bio->bi_private;
- u8 *cb_sum = cb->sums;
-
- if ((inode->flags & BTRFS_INODE_NODATASUM) ||
- test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state))
- return 0;
-
- shash->tfm = fs_info->csum_shash;
-
- for (i = 0; i < cb->nr_pages; i++) {
- u32 pg_offset;
- u32 bytes_left = PAGE_SIZE;
- page = cb->compressed_pages[i];
-
- /* Determine the remaining bytes inside the page first */
- if (i == cb->nr_pages - 1)
- bytes_left = cb->compressed_len - i * PAGE_SIZE;
-
- /* Hash through the page sector by sector */
- for (pg_offset = 0; pg_offset < bytes_left;
- pg_offset += sectorsize) {
- kaddr = kmap_atomic(page);
- crypto_shash_digest(shash, kaddr + pg_offset,
- sectorsize, csum);
- kunmap_atomic(kaddr);
-
- if (memcmp(&csum, cb_sum, csum_size) != 0) {
- btrfs_print_data_csum_error(inode, disk_start,
- csum, cb_sum, cb->mirror_num);
- if (btrfs_bio(bio)->device)
- btrfs_dev_stat_inc_and_print(
- btrfs_bio(bio)->device,
- BTRFS_DEV_STAT_CORRUPTION_ERRS);
- return -EIO;
- }
- cb_sum += csum_size;
- disk_start += sectorsize;
- }
- }
- return 0;
-}
-
-/*
- * Reduce bio and io accounting for a compressed_bio with its corresponding bio.
- *
- * Return true if there is no pending bio nor io.
- * Return false otherwise.
- */
-static bool dec_and_test_compressed_bio(struct compressed_bio *cb, struct bio *bio)
-{
- struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- unsigned int bi_size = 0;
- bool last_io = false;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * At endio time, bi_iter.bi_size doesn't represent the real bio size.
- * Thus here we have to iterate through all segments to grab correct
- * bio size.
- */
- bio_for_each_segment_all(bvec, bio, iter_all)
- bi_size += bvec->bv_len;
-
- if (bio->bi_status)
- cb->status = bio->bi_status;
-
- ASSERT(bi_size && bi_size <= cb->compressed_len);
- last_io = refcount_sub_and_test(bi_size >> fs_info->sectorsize_bits,
- &cb->pending_sectors);
- /*
- * Here we must wake up the possible error handler after all other
- * operations on @cb finished, or we can race with
- * finish_compressed_bio_*() which may free @cb.
- */
- wake_up_var(cb);
-
- return last_io;
-}
-
 static void finish_compressed_bio_read(struct compressed_bio *cb)
 {
 unsigned int index;
 struct page *page;
+ if (cb->status == BLK_STS_OK)
+ cb->status = errno_to_blk_status(btrfs_decompress_bio(cb));
+
 /* Release the compressed pages */
 for (index = 0; index < cb->nr_pages; index++) {
 page = cb->compressed_pages[index];
@@ -247,85 +152,63 @@ static void finish_compressed_bio_read(struct compressed_bio *cb)
 }
 /* Do io completion on the original bio */
- if (cb->status != BLK_STS_OK) {
+ if (cb->status != BLK_STS_OK)
 cb->orig_bio->bi_status = cb->status;
- bio_endio(cb->orig_bio);
- } else {
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * We have verified the checksum already, set page checked so
- * the end_io handlers know about it
- */
- ASSERT(!bio_flagged(cb->orig_bio, BIO_CLONED));
- bio_for_each_segment_all(bvec, cb->orig_bio, iter_all) {
- u64 bvec_start = page_offset(bvec->bv_page) +
- bvec->bv_offset;
-
- btrfs_page_set_checked(btrfs_sb(cb->inode->i_sb),
- bvec->bv_page, bvec_start,
- bvec->bv_len);
- }
-
- bio_endio(cb->orig_bio);
- }
+ bio_endio(cb->orig_bio);
 /* Finally free the cb struct */
 kfree(cb->compressed_pages);
 kfree(cb);
 }
-/* when we finish reading compressed pages from the disk, we
- * decompress them and then run the bio end_io routines on the
- * decompressed pages (in the inode address space).
- *
- * This allows the checksumming and other IO error handling routines
- * to work normally
- *
- * The compressed pages are freed here, and it must be run
- * in process context
+/*
+ * Verify the checksums and kick off repair if needed on the uncompressed data
+ * before decompressing it into the original bio and freeing the uncompressed
+ * pages.
 */
 static void end_compressed_bio_read(struct bio *bio)
 {
 struct compressed_bio *cb = bio->bi_private;
- struct inode *inode;
- unsigned int mirror = btrfs_bio(bio)->mirror_num;
- int ret = 0;
-
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
-
- /*
- * Record the correct mirror_num in cb->orig_bio so that
- * read-repair can work properly.
- */
- btrfs_bio(cb->orig_bio)->mirror_num = mirror;
- cb->mirror_num = mirror;
-
- /*
- * Some IO in this cb have failed, just skip checksum as there
- * is no way it could be correct.
- */
- if (cb->status != BLK_STS_OK)
- goto csum_failed;
-
- inode = cb->inode;
- ret = check_compressed_csum(BTRFS_I(inode), bio,
- bio->bi_iter.bi_sector << 9);
- if (ret)
- goto csum_failed;
+ struct inode *inode = cb->inode;
+ struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_inode *bi = BTRFS_I(inode);
+ bool csum = !(bi->flags & BTRFS_INODE_NODATASUM) &&
+ !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
+ blk_status_t status = bio->bi_status;
+ struct btrfs_bio *bbio = btrfs_bio(bio);
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ u32 offset;
+
+ btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+ u64 start = bbio->file_offset + offset;
+
+ if (!status &&
+ (!csum || !btrfs_check_data_csum(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset))) {
+ clean_io_failure(fs_info, &bi->io_failure_tree,
+ &bi->io_tree, start, bv.bv_page,
+ btrfs_ino(bi), bv.bv_offset);
+ } else {
+ int ret;
+
+ refcount_inc(&cb->pending_ios);
+ ret = btrfs_repair_one_sector(inode, bbio, offset,
+ bv.bv_page, bv.bv_offset,
+ btrfs_submit_data_read_bio);
+ if (ret) {
+ refcount_dec(&cb->pending_ios);
+ status = errno_to_blk_status(ret);
+ }
+ }
+ }
- /* ok, we're the last bio for this extent, lets start
- * the decompression.
- */
- ret = btrfs_decompress_bio(cb);
+ if (status)
+ cb->status = status;
-csum_failed:
- if (ret)
- cb->status = errno_to_blk_status(ret);
- finish_compressed_bio_read(cb);
-out:
+ if (refcount_dec_and_test(&cb->pending_ios))
+ finish_compressed_bio_read(cb);
+ btrfs_bio_free_csum(bbio);
 bio_put(bio);
 }
@@ -403,6 +286,14 @@ static void finish_compressed_bio_write(struct compressed_bio *cb)
 kfree(cb);
 }
+static void btrfs_finish_compressed_write_work(struct work_struct *work)
+{
+ struct compressed_bio *cb =
+ container_of(work, struct compressed_bio, write_end_work);
+
+ finish_compressed_bio_write(cb);
+}
+
 /*
 * Do the cleanup once all the compressed pages hit the disk. This will clear
 * writeback on the file pages and free the compressed pages.
@@ -414,30 +305,18 @@ static void end_compressed_bio_write(struct bio *bio)
 {
 struct compressed_bio *cb = bio->bi_private;
- if (!dec_and_test_compressed_bio(cb, bio))
- goto out;
+ if (bio->bi_status)
+ cb->status = bio->bi_status;
- btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ if (refcount_dec_and_test(&cb->pending_ios)) {
+ struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
- finish_compressed_bio_write(cb);
-out:
+ btrfs_record_physical_zoned(cb->inode, cb->start, bio);
+ queue_work(fs_info->compressed_write_workers, &cb->write_end_work);
+ }
 bio_put(bio);
 }
-static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
- struct compressed_bio *cb,
- struct bio *bio, int mirror_num)
-{
- blk_status_t ret;
-
- ASSERT(bio->bi_iter.bi_size);
- ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
- if (ret)
- return ret;
- ret = btrfs_map_bio(fs_info, bio, mirror_num);
- return ret;
-}
-
 /*
 * Allocate a compressed_bio, which will be used to read/write on-disk
 * (aka, compressed) * data.
@@ -456,7 +335,7 @@ static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
 static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_bytenr,
- unsigned int opf, bio_end_io_t endio_func,
+ blk_opf_t opf, bio_end_io_t endio_func,
 u64 *next_stripe_start)
 {
 struct btrfs_fs_info *fs_info = btrfs_sb(cb->inode->i_sb);
@@ -488,7 +367,7 @@ static struct bio *alloc_compressed_bio(struct compressed_bio *cb, u64 disk_byte
 return ERR_PTR(ret);
 }
 *next_stripe_start = disk_bytenr + geom.len;
-
+ refcount_inc(&cb->pending_ios);
 return bio;
 }
@@ -506,7 +385,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 unsigned int compressed_len,
 struct page **compressed_pages,
 unsigned int nr_pages,
- unsigned int write_flags,
+ blk_opf_t write_flags,
 struct cgroup_subsys_state *blkcg_css,
 bool writeback)
 {
@@ -515,26 +394,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 struct compressed_bio *cb;
 u64 cur_disk_bytenr = disk_start;
 u64 next_stripe_start;
- blk_status_t ret;
+ blk_status_t ret = BLK_STS_OK;
 int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
 const bool use_append = btrfs_use_zone_append(inode, disk_start);
- const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
+ const enum req_op bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
 ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
 IS_ALIGNED(len, fs_info->sectorsize));
- cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
+ cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS);
 if (!cb)
 return BLK_STS_RESOURCE;
- refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits);
+ refcount_set(&cb->pending_ios, 1);
 cb->status = BLK_STS_OK;
 cb->inode = &inode->vfs_inode;
 cb->start = start;
 cb->len = len;
- cb->mirror_num = 0;
 cb->compressed_pages = compressed_pages;
 cb->compressed_len = compressed_len;
 cb->writeback = writeback;
- cb->orig_bio = NULL;
+ INIT_WORK(&cb->write_end_work, btrfs_finish_compressed_write_work);
 cb->nr_pages = nr_pages;
 if (blkcg_css)
@@ -555,8 +433,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 &next_stripe_start);
 if (IS_ERR(bio)) {
 ret = errno_to_blk_status(PTR_ERR(bio));
- bio = NULL;
- goto finish_cb;
+ break;
 }
 if (blkcg_css)
 bio->bi_opf |= REQ_CGROUP_PUNT;
@@ -600,44 +477,25 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
 if (submit) {
 if (!skip_sum) {
 ret = btrfs_csum_one_bio(inode, bio, start, true);
- if (ret)
- goto finish_cb;
+ if (ret) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ break;
+ }
 }
- ret = submit_compressed_bio(fs_info, cb, bio, 0);
- if (ret)
- goto finish_cb;
+ ASSERT(bio->bi_iter.bi_size);
+ btrfs_submit_bio(fs_info, bio, 0);
 bio = NULL;
 }
 cond_resched();
 }
- if (blkcg_css)
- kthread_associate_blkcg(NULL);
- return 0;
-
-finish_cb:
 if (blkcg_css)
 kthread_associate_blkcg(NULL);
- if (bio) {
- bio->bi_status = ret;
- bio_endio(bio);
- }
- /* Last byte of @cb is submitted, endio will free @cb */
- if (cur_disk_bytenr == disk_start + compressed_len)
- return ret;
-
- wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
- (disk_start + compressed_len - cur_disk_bytenr) >>
- fs_info->sectorsize_bits);
- /*
- * Even with previous bio ended, we should still have io not yet
- * submitted, thus need to finish manually.
- */
- ASSERT(refcount_read(&cb->pending_sectors));
- /* Now we are the only one referring @cb, can finish it safely.
*/ - finish_compressed_bio_write(cb); + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_write(cb); return ret; } @@ -766,7 +624,6 @@ static noinline int add_ra_bio_pages(struct inode *inode, int zeros; zeros = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, zeros); - flush_dcache_page(page); } } @@ -802,15 +659,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, * After the compressed pages are read, we copy the bytes into the * bio we were passed and then call the bio end_io calls */ -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map_tree *em_tree; struct compressed_bio *cb; unsigned int compressed_len; - unsigned int nr_pages; - unsigned int pg_index; struct bio *comp_bio = NULL; const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; u64 cur_disk_byte = disk_bytenr; @@ -820,8 +675,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, u64 em_start; struct extent_map *em; blk_status_t ret; - int faili = 0; - u8 *sums; + int ret2; + int i; em_tree = &BTRFS_I(inode)->extent_tree; @@ -839,48 +694,40 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, ASSERT(em->compress_type != BTRFS_COMPRESS_NONE); compressed_len = em->block_len; - cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); + cb = kmalloc(sizeof(struct compressed_bio), GFP_NOFS); if (!cb) { ret = BLK_STS_RESOURCE; goto out; } - refcount_set(&cb->pending_sectors, compressed_len >> fs_info->sectorsize_bits); + refcount_set(&cb->pending_ios, 1); cb->status = BLK_STS_OK; cb->inode = inode; - cb->mirror_num = mirror_num; - sums = cb->sums; cb->start = em->orig_start; em_len = em->len; em_start = em->start; - free_extent_map(em); - em = NULL; - cb->len = bio->bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_compress_type(bio_flags); + cb->compress_type = em->compress_type; cb->orig_bio = bio; - nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_NOFS); + free_extent_map(em); + em = NULL; + + cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE); + cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS); if (!cb->compressed_pages) { ret = BLK_STS_RESOURCE; - goto fail1; + goto fail; } - for (pg_index = 0; pg_index < nr_pages; pg_index++) { - cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS); - if (!cb->compressed_pages[pg_index]) { - faili = pg_index - 1; - ret = BLK_STS_RESOURCE; - goto fail2; - } + ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages); + if (ret2) { + ret = BLK_STS_RESOURCE; + goto fail; } - faili = nr_pages - 1; - cb->nr_pages = nr_pages; add_ra_bio_pages(inode, em_start + em_len, cb); @@ -901,9 +748,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, REQ_OP_READ, end_compressed_bio_read, &next_stripe_start); if (IS_ERR(comp_bio)) { - ret = errno_to_blk_status(PTR_ERR(comp_bio)); - comp_bio = NULL; - goto finish_cb; + cb->status = errno_to_blk_status(PTR_ERR(comp_bio)); + break; } } /* @@ -939,58 +785,50 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, submit = true; if (submit) { - unsigned int nr_sectors; + /* Save the original iter for read 
repair */ + if (bio_op(comp_bio) == REQ_OP_READ) + btrfs_bio(comp_bio)->iter = comp_bio->bi_iter; - ret = btrfs_lookup_bio_sums(inode, comp_bio, sums); - if (ret) - goto finish_cb; + /* + * Save the initial offset of this chunk, as there + * is no direct correlation between compressed pages and + * the original file offset. The field is only used for + * printing error messages. + */ + btrfs_bio(comp_bio)->file_offset = file_offset; - nr_sectors = DIV_ROUND_UP(comp_bio->bi_iter.bi_size, - fs_info->sectorsize); - sums += fs_info->csum_size * nr_sectors; + ret = btrfs_lookup_bio_sums(inode, comp_bio, NULL); + if (ret) { + comp_bio->bi_status = ret; + bio_endio(comp_bio); + break; + } - ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num); - if (ret) - goto finish_cb; + ASSERT(comp_bio->bi_iter.bi_size); + btrfs_submit_bio(fs_info, comp_bio, mirror_num); comp_bio = NULL; } } - return BLK_STS_OK; -fail2: - while (faili >= 0) { - __free_page(cb->compressed_pages[faili]); - faili--; + if (refcount_dec_and_test(&cb->pending_ios)) + finish_compressed_bio_read(cb); + return; + +fail: + if (cb->compressed_pages) { + for (i = 0; i < cb->nr_pages; i++) { + if (cb->compressed_pages[i]) + __free_page(cb->compressed_pages[i]); + } } kfree(cb->compressed_pages); -fail1: kfree(cb); out: free_extent_map(em); bio->bi_status = ret; bio_endio(bio); - return ret; -finish_cb: - if (comp_bio) { - comp_bio->bi_status = ret; - bio_endio(comp_bio); - } - /* All bytes of @cb is submitted, endio will free @cb */ - if (cur_disk_byte == disk_bytenr + compressed_len) - return ret; - - wait_var_event(cb, refcount_read(&cb->pending_sectors) == - (disk_bytenr + compressed_len - cur_disk_byte) >> - fs_info->sectorsize_bits); - /* - * Even with previous bio ended, we should still have io not yet - * submitted, thus need to finish @cb manually. - */ - ASSERT(refcount_read(&cb->pending_sectors)); - /* Now we are the only one referring @cb, can finish it safely.
*/ - finish_compressed_bio_read(cb); - return ret; + return; } /* @@ -1489,7 +1327,6 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, ASSERT(copy_start - decompressed < buf_len); memcpy_to_page(bvec.bv_page, bvec.bv_offset, buf + copy_start - decompressed, copy_len); - flush_dcache_page(bvec.bv_page); cur_offset += copy_len; bio_advance(orig_bio, copy_len); diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index ac5b20731d2a..1aa02903de69 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -30,8 +30,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0); #define BTRFS_ZLIB_DEFAULT_LEVEL 3 struct compressed_bio { - /* Number of sectors with unfinished IO (unsubmitted or unfinished) */ - refcount_t pending_sectors; + /* Number of outstanding bios */ + refcount_t pending_ios; /* Number of compressed pages in the array */ unsigned int nr_pages; @@ -59,16 +59,12 @@ struct compressed_bio { /* IO errors */ blk_status_t status; - int mirror_num; - /* for reads, this is the bio we are copying the data into */ - struct bio *orig_bio; - - /* - * the start of a variable length array of checksums only - * used by reads - */ - u8 sums[]; + union { + /* For reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + struct work_struct write_end_work; + }; }; static inline unsigned int btrfs_compress_type(unsigned int type_level) @@ -99,11 +95,11 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, unsigned int compressed_len, struct page **compressed_pages, unsigned int nr_pages, - unsigned int write_flags, + blk_opf_t write_flags, struct cgroup_subsys_state *blkcg_css, bool writeback); -blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num); unsigned int btrfs_compress_str2level(unsigned int type, const char *str); diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0eecf98d0abb..6e556031a8f3 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -16,6 +16,7 @@ #include "volumes.h" #include "qgroup.h" #include "tree-mod-log.h" +#include "tree-checker.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans, int level = btrfs_header_level(buf); ret = btrfs_set_disk_extent_flags(trans, buf, - new_flags, level, 0); + new_flags, level); if (ret) return ret; } @@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level, } /* - * helper function for btrfs_search_slot. The goal is to find a block - * in cache without setting the path to blocking. If we find the block - * we return zero and the path is unchanged. + * Helper function for btrfs_search_slot() and other functions that do a search + * on a btree. The goal is to find a tree block in the cache (the radix tree at + * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read + * its pages from disk. * - * If we can't find the block, we set the path blocking and do some - * reada. -EAGAIN is returned and the search must be repeated. + * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the + * whole btree search, starting again from the current root node. 
*/ static int read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, @@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, struct btrfs_key first_key; int ret; int parent_level; + bool unlock_up; + unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]); blocknr = btrfs_node_blockptr(*eb_ret, slot); gen = btrfs_node_ptr_generation(*eb_ret, slot); parent_level = btrfs_header_level(*eb_ret); btrfs_node_key_to_cpu(*eb_ret, &first_key, slot); + /* + * If we need to read an extent buffer from disk and we are holding locks + * on upper level nodes, we unlock all the upper nodes before reading the + * extent buffer, and then return -EAGAIN to the caller as it needs to + * restart the search. We don't release the lock on the current level + * because we need to walk this node to figure out which blocks to read. + */ tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { if (p->reada == READA_FORWARD_ALWAYS) @@ -1436,30 +1447,38 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, return 0; } + if (unlock_up) + btrfs_unlock_up_safe(p, level + 1); + /* now we're allowed to do a blocking uptodate check */ - ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key); + ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key); if (ret) { free_extent_buffer(tmp); btrfs_release_path(p); return -EIO; } - *eb_ret = tmp; - return 0; + if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) { + free_extent_buffer(tmp); + btrfs_release_path(p); + return -EUCLEAN; + } + + if (unlock_up) + ret = -EAGAIN; + + goto out; } - /* - * reduce lock contention at high levels - * of the btree by dropping locks before - * we read. Don't release the lock on the current - * level because we need to walk this node to figure - * out which blocks to read. - */ - btrfs_unlock_up_safe(p, level + 1); + if (unlock_up) { + btrfs_unlock_up_safe(p, level + 1); + ret = -EAGAIN; + } else { + ret = 0; + } if (p->reada != READA_NONE) reada_for_search(fs_info, p, level, slot, key->objectid); - ret = -EAGAIN; tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid, gen, parent_level - 1, &first_key); if (IS_ERR(tmp)) { @@ -1474,9 +1493,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, */ if (!extent_buffer_uptodate(tmp)) ret = -EIO; - free_extent_buffer(tmp); - btrfs_release_path(p); +out: + if (ret == 0) { + *eb_ret = tmp; + } else { + free_extent_buffer(tmp); + btrfs_release_path(p); + } + return ret; } @@ -2279,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, return ret; } +/** + * Search for a valid slot for the given path. + * + * @root: The root node of the tree. + * @key: Will contain a valid item if found. + * @path: The starting point to validate the slot. + * + * Return: 0 if the item is valid + * 1 if not found + * <0 if error. + */ +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path) +{ + while (1) { + int ret; + const int slot = path->slots[0]; + const struct extent_buffer *leaf = path->nodes[0]; + + /* This is where we start walking the path. */ + if (slot >= btrfs_header_nritems(leaf)) { + /* + * If we've reached the last slot in this leaf we need + * to go to the next leaf and reset the path. + */ + ret = btrfs_next_leaf(root, path); + if (ret) + return ret; + continue; + } + /* Store the found, valid item in @key. 
*/ + btrfs_item_key_to_cpu(leaf, key, slot); + break; + } + return 0; +} + /* * adjust the pointers going up the tree, starting at level * making sure the right key of each node is points to 'key'. diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 077c95e9baa5..4db85b9dc7ed 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -107,14 +107,6 @@ struct btrfs_ioctl_encoded_io_args; #define BTRFS_STAT_CURR 0 #define BTRFS_STAT_PREV 1 -/* - * Count how many BTRFS_MAX_EXTENT_SIZE cover the @size - */ -static inline u32 count_max_extents(u64 size) -{ - return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); -} - static inline unsigned long btrfs_chunk_item_size(int num_stripes) { BUG_ON(num_stripes == 0); @@ -230,6 +222,13 @@ struct btrfs_root_backup { #define BTRFS_SUPER_INFO_SIZE 4096 /* + * The reserved space at the beginning of each device. + * It covers the primary super block and leaves space for potential use by other + * tools like bootloaders or to lower potential damage of accidental overwrite. + */ +#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M) + +/* * the super block basically lists the main trees of the FS * it currently lacks any block count etc etc */ @@ -248,8 +247,12 @@ struct btrfs_super_block { __le64 chunk_root; __le64 log_root; - /* this will help find the new super based on the log root */ - __le64 log_root_transid; + /* + * This member has never been utilized since the very beginning, thus + * it's always 0 regardless of kernel version. We always use + * generation + 1 to read log tree root. So here we mark it deprecated. + */ + __le64 __unused_log_root_transid; __le64 total_bytes; __le64 bytes_used; __le64 root_dir_objectid; @@ -635,6 +638,9 @@ enum { /* Indicate we have half completed snapshot deletions pending. */ BTRFS_FS_UNFINISHED_DROPS, + /* Indicate we have to finish a zone to do next allocation. */ + BTRFS_FS_NEED_ZONE_FINISH, + #if BITS_PER_LONG == 32 /* Indicate if we have error/warn message printed on 32bit systems */ BTRFS_FS_32BIT_ERROR, @@ -656,6 +662,18 @@ enum btrfs_exclusive_operation { BTRFS_EXCLOP_SWAP_ACTIVATE, }; +/* Store data about transaction commits, exported via sysfs. 
*/ +struct btrfs_commit_stats { + /* Total number of commits */ + u64 commit_count; + /* The maximum commit duration so far in ns */ + u64 max_commit_dur; + /* The last commit duration in ns */ + u64 last_commit_dur; + /* The total commit duration in ns */ + u64 total_commit_dur; +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -679,9 +697,8 @@ struct btrfs_fs_info { struct radix_tree_root fs_roots_radix; /* block group cache stuff */ - spinlock_t block_group_cache_lock; - u64 first_logical_byte; - struct rb_root block_group_cache_tree; + rwlock_t block_group_cache_lock; + struct rb_root_cached block_group_cache_tree; /* keep track of unallocated space */ atomic64_t free_chunk_space; @@ -848,13 +865,14 @@ struct btrfs_fs_info { * two */ struct btrfs_workqueue *workers; + struct btrfs_workqueue *hipri_workers; struct btrfs_workqueue *delalloc_workers; struct btrfs_workqueue *flush_workers; - struct btrfs_workqueue *endio_workers; - struct btrfs_workqueue *endio_meta_workers; - struct btrfs_workqueue *endio_raid56_workers; - struct btrfs_workqueue *rmw_workers; - struct btrfs_workqueue *endio_meta_write_workers; + struct workqueue_struct *endio_workers; + struct workqueue_struct *endio_meta_workers; + struct workqueue_struct *endio_raid56_workers; + struct workqueue_struct *rmw_workers; + struct workqueue_struct *compressed_write_workers; struct btrfs_workqueue *endio_write_workers; struct btrfs_workqueue *endio_freespace_worker; struct btrfs_workqueue *caching_workers; @@ -946,9 +964,9 @@ struct btrfs_fs_info { * running. */ refcount_t scrub_workers_refcnt; - struct btrfs_workqueue *scrub_workers; - struct btrfs_workqueue *scrub_wr_completion_workers; - struct btrfs_workqueue *scrub_parity_workers; + struct workqueue_struct *scrub_workers; + struct workqueue_struct *scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity_workers; struct btrfs_subpage_info *subpage_info; struct btrfs_discard_ctl discard_ctl; @@ -1032,6 +1050,12 @@ struct btrfs_fs_info { u32 csums_per_leaf; u32 stripesize; + /* + * Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular + * filesystem, on zoned it depends on the device constraints. + */ + u64 max_extent_size; + /* Block groups and devices containing active swapfiles. */ spinlock_t swapfile_pins_lock; struct rb_root swapfile_pins; @@ -1045,11 +1069,10 @@ struct btrfs_fs_info { * Zone size > 0 when in ZONED mode, otherwise it's used for a check * if the mode is enabled */ - union { - u64 zone_size; - u64 zoned; - }; + u64 zone_size; + /* Max size to emit ZONE_APPEND write command */ + u64 max_zone_append_size; struct mutex zoned_meta_io_lock; spinlock_t treelog_bg_lock; u64 treelog_bg; @@ -1066,6 +1089,11 @@ struct btrfs_fs_info { spinlock_t zone_active_bgs_lock; struct list_head zone_active_bgs; + /* Waiters when BTRFS_FS_NEED_ZONE_FINISH is set */ + wait_queue_head_t zone_finish_wait; + + /* Updates are not protected by any lock */ + struct btrfs_commit_stats commit_stats; #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; @@ -1331,6 +1359,8 @@ struct btrfs_replace_extent_info { * existing extent into a file range. */ bool is_new_extent; + /* Indicate if we should update the inode's mtime and ctime. */ + bool update_times; /* Meaningful only if is_new_extent is true. 
*/ int qgroup_reserved; /* @@ -2476,8 +2506,6 @@ BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, chunk_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, log_root, 64); -BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, - log_root_transid, 64); BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, log_root_level, 8); BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, @@ -2734,8 +2762,16 @@ int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb, enum btrfs_inline_ref_type is_data); u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset); +static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info, u8 *csums, + u64 offset) +{ + u64 offset_in_sectors = offset >> fs_info->sectorsize_bits; + + return csums + offset_in_sectors * fs_info->csum_size; +} + /* - * Take the number of bytes to be checksummmed and figure out how many leaves + * Take the number of bytes to be checksummed and figure out how many leaves * it would require to store the csums for that many bytes. */ static inline u64 btrfs_csum_bytes_to_leaves( @@ -2784,7 +2820,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes); int btrfs_exclude_logged_extents(struct extent_buffer *eb); int btrfs_cross_ref_exist(struct btrfs_root *root, - u64 objectid, u64 offset, u64 bytenr, bool strict); + u64 objectid, u64 offset, u64 bytenr, bool strict, + struct btrfs_path *path); struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 parent, u64 root_objectid, @@ -2811,8 +2848,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, int full_backref); int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, - struct extent_buffer *eb, u64 flags, - int level, int is_data); + struct extent_buffer *eb, u64 flags, int level); int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, @@ -2892,7 +2928,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root, void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes); + u64 disk_num_bytes, bool noflush); u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo); int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); @@ -3039,6 +3075,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *path); +int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key, + struct btrfs_path *path); + +/* + * Search in @root for a given @key, and store the slot found in @found_key. + * + * @root: The root node of the tree. + * @key: The key we are looking for. + * @found_key: Will hold the found item. + * @path: Holds the current slot/leaf. + * @iter_ret: Contains the value returned from btrfs_search_slot or + * btrfs_get_next_valid_item, whichever was executed last. 
+ * + * The @iter_ret is an output variable that will contain the return value of + * btrfs_search_slot, if it encountered an error, or the value returned from + * btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid + * slot was found, 1 if there were no more leaves, and <0 if there was an error. + * + * It's recommended to use a separate variable for iter_ret and then use it to + * set the function return value so there's no confusion of the 0/1/errno + * values stemming from btrfs_search_slot. + */ +#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \ + for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \ + (iter_ret) >= 0 && \ + (iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \ + (path)->slots[0]++ \ + ) + static inline int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *p, u64 time_seq) { @@ -3190,7 +3255,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset); /* file-item.c */ -struct btrfs_dio_private; int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst); @@ -3224,11 +3288,18 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num); +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 bio_offset, struct page *page, u64 start, u64 end); +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3255,21 +3326,38 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns); +struct btrfs_new_inode_args { + /* Input */ + struct inode *dir; + struct dentry *dentry; + struct inode *inode; + bool orphan; + bool subvol; + + /* + * Output from btrfs_new_inode_prepare(), input to + * btrfs_create_new_inode(). 
+ */ + struct posix_acl *default_acl; + struct posix_acl *acl; +}; +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items); +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args); +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args); +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir); void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits); + u32 bits); void btrfs_clear_delalloc_extent(struct inode *inode, - struct extent_state *state, unsigned *bits); + struct extent_state *state, u32 bits); void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other); void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split); void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end); vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf); -int btrfs_readpage(struct file *file, struct page *page); void btrfs_evict_inode(struct inode *inode); int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc); struct inode *btrfs_alloc_inode(struct super_block *sb); @@ -3309,14 +3397,20 @@ int btrfs_writepage_cow_fixup(struct page *page); void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate); +int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type); +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, + struct page **pages); ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, struct btrfs_ioctl_encoded_io_args *encoded); ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, const struct btrfs_ioctl_encoded_io_args *encoded); +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before); + extern const struct dentry_operations btrfs_dentry_operations; -extern const struct iomap_ops btrfs_dio_iomap_ops; -extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) @@ -3328,6 +3422,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes, const u64 del_bytes); +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3403,11 +3498,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { } -#ifdef CONFIG_PRINTK +#ifdef CONFIG_PRINTK_INDEX + +#define btrfs_printk(fs_info, fmt, args...) \ +do { \ + printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \ + _btrfs_printk(fs_info, fmt, ##args); \ +} while (0) + +__printf(2, 3) +__cold +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + +#elif defined(CONFIG_PRINTK) + +#define btrfs_printk(fs_info, fmt, args...) \ + _btrfs_printk(fs_info, fmt, ##args) + __printf(2, 3) __cold -void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); +void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...); + #else + #define btrfs_printk(fs_info, fmt, args...) 
\ btrfs_no_printk(fs_info, fmt, ##args) #endif @@ -3658,12 +3771,25 @@ do { \ __LINE__, (errno)); \ } while (0) +#ifdef CONFIG_PRINTK_INDEX + #define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ -do { \ - __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ - (errno), fmt, ##args); \ +do { \ + printk_index_subsys_emit( \ + "BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \ + KERN_CRIT, fmt); \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args); \ } while (0) +#else + +#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \ + __btrfs_handle_fs_error((fs_info), __func__, __LINE__, \ + (errno), fmt, ##args) + +#endif + #define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \ &(fs_info)->fs_state))) #define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \ @@ -3816,15 +3942,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu); int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode, struct posix_acl *acl, int type); -int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir); +int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode, + struct posix_acl *acl, int type); #else #define btrfs_get_acl NULL #define btrfs_set_acl NULL -static inline int btrfs_init_acl(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir) +static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, + int type) { - return 0; + return -EOPNOTSUPP; } #endif @@ -3929,7 +4056,20 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info) { - return fs_info->zoned != 0; + return fs_info->zone_size > 0; +} + +/* + * Count how many fs_info->max_extent_size cover the @size + */ +static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (!fs_info) + return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE); +#endif + + return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index bd8267c4687d..1e8f17ff829e 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -273,7 +273,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 disk_num_bytes, u64 *meta_reserve, u64 *qgroup_reserve) { - u64 nr_extents = count_max_extents(num_bytes); + u64 nr_extents = count_max_extents(fs_info, num_bytes); u64 csum_leaves = btrfs_csum_bytes_to_leaves(fs_info, disk_num_bytes); u64 inode_update = btrfs_calc_metadata_size(fs_info, 1); @@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info, } int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, - u64 disk_num_bytes) + u64 disk_num_bytes, bool noflush) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * If we have a transaction open (can happen if we call truncate_block * from truncate), then we need FLUSH_LIMIT so we don't deadlock. 
*/ - if (btrfs_is_free_space_inode(inode)) { + if (noflush || btrfs_is_free_space_inode(inode)) { flush = BTRFS_RESERVE_NO_FLUSH; } else { if (current->journal_info) @@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, */ calc_inode_reservations(fs_info, num_bytes, disk_num_bytes, &meta_reserve, &qgroup_reserve); - ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true, + noflush); if (ret) return ret; ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush); @@ -349,7 +350,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, * needs to free the reservation we just made. */ spin_lock(&inode->lock); - nr_extents = count_max_extents(num_bytes); + nr_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, nr_extents); inode->csum_bytes += disk_num_bytes; btrfs_calculate_inode_block_rsv_size(fs_info, inode); @@ -412,7 +413,7 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) unsigned num_extents; spin_lock(&inode->lock); - num_extents = count_max_extents(num_bytes); + num_extents = count_max_extents(fs_info, num_bytes); btrfs_mod_outstanding_extents(inode, -num_extents); btrfs_calculate_inode_block_rsv_size(fs_info, inode); spin_unlock(&inode->lock); @@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, ret = btrfs_check_data_free_space(inode, reserved, start, len); if (ret < 0) return ret; - ret = btrfs_delalloc_reserve_metadata(inode, len, len); + ret = btrfs_delalloc_reserve_metadata(inode, len, len, false); if (ret < 0) { btrfs_free_reserved_data_space(inode, *reserved, start, len); extent_changeset_free(*reserved); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 748bf6b0d860..e7f34871a132 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -52,18 +52,6 @@ static inline void btrfs_init_delayed_node( INIT_LIST_HEAD(&delayed_node->p_list); } -static inline int btrfs_is_continuous_delayed_item( - struct btrfs_delayed_item *item1, - struct btrfs_delayed_item *item2) -{ - if (item1->key.type == BTRFS_DIR_INDEX_KEY && - item1->key.objectid == item2->key.objectid && - item1->key.type == item2->key.type && - item1->key.offset + 1 == item2->key.offset) - return 1; - return 0; -} - static struct btrfs_delayed_node *btrfs_get_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -398,8 +386,7 @@ static struct btrfs_delayed_item *__btrfs_lookup_delayed_insertion_item( } static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, - struct btrfs_delayed_item *ins, - int action) + struct btrfs_delayed_item *ins) { struct rb_node **p, *node; struct rb_node *parent_node = NULL; @@ -408,9 +395,9 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, int cmp; bool leftmost = true; - if (action == BTRFS_DELAYED_INSERTION_ITEM) + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM) root = &delayed_node->ins_root; - else if (action == BTRFS_DELAYED_DELETION_ITEM) + else if (ins->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) root = &delayed_node->del_root; else BUG(); @@ -436,32 +423,19 @@ static int __btrfs_add_delayed_item(struct btrfs_delayed_node *delayed_node, rb_link_node(node, parent_node, p); rb_insert_color_cached(node, root, leftmost); ins->delayed_node = delayed_node; - ins->ins_or_del = action; - if (ins->key.type == BTRFS_DIR_INDEX_KEY && - action == 
BTRFS_DELAYED_INSERTION_ITEM && + /* Delayed items are always for dir index items. */ + ASSERT(ins->key.type == BTRFS_DIR_INDEX_KEY); + + if (ins->ins_or_del == BTRFS_DELAYED_INSERTION_ITEM && ins->key.offset >= delayed_node->index_cnt) - delayed_node->index_cnt = ins->key.offset + 1; + delayed_node->index_cnt = ins->key.offset + 1; delayed_node->count++; atomic_inc(&delayed_node->root->fs_info->delayed_root->items); return 0; } -static int __btrfs_add_delayed_insertion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_INSERTION_ITEM); -} - -static int __btrfs_add_delayed_deletion_item(struct btrfs_delayed_node *node, - struct btrfs_delayed_item *item) -{ - return __btrfs_add_delayed_item(node, item, - BTRFS_DELAYED_DELETION_ITEM); -} - static void finish_one_item(struct btrfs_delayed_root *delayed_root) { int seq = atomic_inc_return(&delayed_root->items_seq); @@ -573,7 +547,13 @@ static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "delayed_item", item->key.objectid, num_bytes, 1); - item->bytes_reserved = num_bytes; + /* + * For insertions we track reserved metadata space by accounting + * for the number of leaves that will be used, based on the delayed + * node's index_items_size field. + */ + if (item->ins_or_del == BTRFS_DELAYED_DELETION_ITEM) + item->bytes_reserved = num_bytes; } return ret; @@ -599,6 +579,21 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, btrfs_block_rsv_release(fs_info, rsv, item->bytes_reserved, NULL); } +static void btrfs_delayed_item_release_leaves(struct btrfs_delayed_node *node, + unsigned int num_leaves) +{ + struct btrfs_fs_info *fs_info = node->root->fs_info; + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, num_leaves); + + /* There are no space reservations during log replay, bail out. */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + return; + + trace_btrfs_space_reservation(fs_info, "delayed_item", node->inode_id, + bytes, 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, bytes, NULL); +} + static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -672,22 +667,53 @@ static void btrfs_delayed_inode_release_metadata(struct btrfs_fs_info *fs_info, } /* - * Insert a single delayed item or a batch of delayed items that have consecutive - * keys if they exist. + * Insert a single delayed item or a batch of delayed items, as many as possible + * that fit in a leaf. The delayed items (dir index keys) are sorted by their key + * in the rbtree, and if there's a gap between two consecutive dir index items, + * then it means at some point we had delayed dir indexes to add but they got + * removed (by btrfs_delete_delayed_dir_index()) before we attempted to flush them + * into the subvolume tree. Dir index keys also have their offsets coming from a + * monotonically increasing counter, so we can't get new keys with an offset that + * fits within a gap between delayed dir index items. 
*/ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, struct btrfs_delayed_item *first_item) { + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_delayed_node *node = first_item->delayed_node; LIST_HEAD(item_list); struct btrfs_delayed_item *curr; struct btrfs_delayed_item *next; - const int max_size = BTRFS_LEAF_DATA_SIZE(root->fs_info); + const int max_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_item_batch batch; int total_size; char *ins_data = NULL; int ret; + bool continuous_keys_only = false; + + lockdep_assert_held(&node->mutex); + + /* + * During normal operation the delayed index offset is continuously + * increasing, so we can batch insert all items as there will not be any + * overlapping keys in the tree. + * + * The exception to this is log replay, where we may have interleaved + * offsets in the tree, so our batch needs to be continuous keys only in + * order to ensure we do not end up with out of order items in our leaf. + */ + if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) + continuous_keys_only = true; + + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(first_item->bytes_reserved == 0); list_add_tail(&first_item->tree_list, &item_list); batch.total_data_size = first_item->data_len; @@ -699,9 +725,19 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, int next_size; next = __btrfs_next_delayed_item(curr); - if (!next || !btrfs_is_continuous_delayed_item(curr, next)) + if (!next) + break; + + /* + * We cannot allow gaps in the key space if we're doing log + * replay. + */ + if (continuous_keys_only && + (next->key.offset != curr->key.offset + 1)) break; + ASSERT(next->bytes_reserved == 0); + next_size = next->data_len + sizeof(struct btrfs_item); if (total_size + next_size > max_size) break; @@ -758,9 +794,41 @@ static int btrfs_insert_delayed_item(struct btrfs_trans_handle *trans, */ btrfs_release_path(path); + ASSERT(node->index_item_leaves > 0); + + /* + * For normal operations we will batch an entire leaf's worth of delayed + * items, so if there are more items to process we can decrement + * index_item_leaves by 1 as we inserted 1 leaf's worth of items. + * + * However for log replay we may not have inserted an entire leaf's + * worth of items, we may not have had continuous items, so decrementing + * here would mess up the index_item_leaves accounting. For this case + * only clean up the accounting when there are no items left. + */ + if (next && !continuous_keys_only) { + /* + * We inserted one batch of items into a leaf and there are more + * items to flush in a future batch, now release one unit of + * metadata space from the delayed block reserve, corresponding + * to the leaf we just flushed. + */ + btrfs_delayed_item_release_leaves(node, 1); + node->index_item_leaves--; + } else if (!next) { + /* + * There are no more items to insert. We can have a number of + * reserved leaves > 1 here - this happens when many dir index + * items are added and then removed before they are flushed (file + * names with a very short life, never span a transaction). So + * release all remaining leaves.
+ */ + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + list_for_each_entry_safe(curr, next, &item_list, tree_list) { list_del(&curr->tree_list); - btrfs_delayed_item_release_metadata(root, curr); btrfs_release_delayed_item(curr); } out: @@ -796,62 +864,75 @@ static int btrfs_batch_delete_items(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_delayed_item *item) { + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_delayed_item *curr, *next; - struct extent_buffer *leaf; - struct btrfs_key key; - struct list_head head; - int nitems, i, last_item; - int ret = 0; + struct extent_buffer *leaf = path->nodes[0]; + LIST_HEAD(batch_list); + int nitems, slot, last_slot; + int ret; + u64 total_reserved_size = item->bytes_reserved; - BUG_ON(!path->nodes[0]); + ASSERT(leaf != NULL); - leaf = path->nodes[0]; + slot = path->slots[0]; + last_slot = btrfs_header_nritems(leaf) - 1; + /* + * Our caller always gives us a path pointing to an existing item, so + * this can not happen. + */ + ASSERT(slot <= last_slot); + if (WARN_ON(slot > last_slot)) + return -ENOENT; - i = path->slots[0]; - last_item = btrfs_header_nritems(leaf) - 1; - if (i > last_item) - return -ENOENT; /* FIXME: Is errno suitable? */ + nitems = 1; + curr = item; + list_add_tail(&curr->tree_list, &batch_list); - next = item; - INIT_LIST_HEAD(&head); - btrfs_item_key_to_cpu(leaf, &key, i); - nitems = 0; /* - * count the number of the dir index items that we can delete in batch + * Keep checking if the next delayed item matches the next item in the + * leaf - if so, we can add it to the batch of items to delete from the + * leaf. */ - while (btrfs_comp_cpu_keys(&next->key, &key) == 0) { - list_add_tail(&next->tree_list, &head); - nitems++; + while (slot < last_slot) { + struct btrfs_key key; - curr = next; next = __btrfs_next_delayed_item(curr); if (!next) break; - if (!btrfs_is_continuous_delayed_item(curr, next)) - break; - - i++; - if (i > last_item) + slot++; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_comp_cpu_keys(&next->key, &key) != 0) break; - btrfs_item_key_to_cpu(leaf, &key, i); + nitems++; + curr = next; + list_add_tail(&curr->tree_list, &batch_list); + total_reserved_size += curr->bytes_reserved; } - if (!nitems) - return 0; - ret = btrfs_del_items(trans, root, path, path->slots[0], nitems); if (ret) - goto out; + return ret; + + /* In case of BTRFS_FS_LOG_RECOVERING items won't have reserved space */ + if (total_reserved_size > 0) { + /* + * Check btrfs_delayed_item_reserve_metadata() to see why we + * don't need to release/reserve qgroup space. 
+ */ + trace_btrfs_space_reservation(fs_info, "delayed_item", + item->key.objectid, total_reserved_size, + 0); + btrfs_block_rsv_release(fs_info, &fs_info->delayed_block_rsv, + total_reserved_size, NULL); + } - list_for_each_entry_safe(curr, next, &head, tree_list) { - btrfs_delayed_item_release_metadata(root, curr); + list_for_each_entry_safe(curr, next, &batch_list, tree_list) { list_del(&curr->tree_list); btrfs_release_delayed_item(curr); } -out: - return ret; + return 0; } static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, @@ -859,43 +940,52 @@ static int btrfs_delete_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_delayed_node *node) { - struct btrfs_delayed_item *curr, *prev; int ret = 0; -do_again: - mutex_lock(&node->mutex); - curr = __btrfs_first_delayed_deletion_item(node); - if (!curr) - goto delete_fail; + while (ret == 0) { + struct btrfs_delayed_item *item; + + mutex_lock(&node->mutex); + item = __btrfs_first_delayed_deletion_item(node); + if (!item) { + mutex_unlock(&node->mutex); + break; + } + + ret = btrfs_search_slot(trans, root, &item->key, path, -1, 1); + if (ret > 0) { + /* + * There's no matching item in the leaf. This means we + * have already deleted this item in a past run of the + * delayed items. We ignore errors when running delayed + * items from an async context, through a work queue job + * running btrfs_async_run_delayed_root(), and don't + * release delayed items that failed to complete. This + * is because we will retry later, and at transaction + * commit time we always run delayed items and will + * then deal with errors if they fail to run again. + * + * So just release delayed items for which we can't find + * an item in the tree, and move to the next item. + */ + btrfs_release_path(path); + btrfs_release_delayed_item(item); + ret = 0; + } else if (ret == 0) { + ret = btrfs_batch_delete_items(trans, root, path, item); + btrfs_release_path(path); + } - ret = btrfs_search_slot(trans, root, &curr->key, path, -1, 1); - if (ret < 0) - goto delete_fail; - else if (ret > 0) { /* - * can't find the item which the node points to, so this node - * is invalid, just drop it. + * We unlock and relock on each iteration, this is to prevent + * blocking other tasks for too long while we are being run from + * the async context (work queue job). Those tasks are typically + * running system calls like creat/mkdir/rename/unlink/etc which + * need to add delayed items to this delayed node. 
*/ - prev = curr; - curr = __btrfs_next_delayed_item(prev); - btrfs_release_delayed_item(prev); - ret = 0; - btrfs_release_path(path); - if (curr) { - mutex_unlock(&node->mutex); - goto do_again; - } else - goto delete_fail; + mutex_unlock(&node->mutex); } - btrfs_batch_delete_items(trans, root, path, curr); - btrfs_release_path(path); - mutex_unlock(&node->mutex); - goto do_again; - -delete_fail: - btrfs_release_path(path); - mutex_unlock(&node->mutex); return ret; } @@ -1354,9 +1444,13 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, struct btrfs_disk_key *disk_key, u8 type, u64 index) { + struct btrfs_fs_info *fs_info = trans->fs_info; + const unsigned int leaf_data_size = BTRFS_LEAF_DATA_SIZE(fs_info); struct btrfs_delayed_node *delayed_node; struct btrfs_delayed_item *delayed_item; struct btrfs_dir_item *dir_item; + bool reserve_leaf_space; + u32 data_len; int ret; delayed_node = btrfs_get_or_create_delayed_node(dir); @@ -1372,6 +1466,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, delayed_item->key.objectid = btrfs_ino(dir); delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; + delayed_item->ins_or_del = BTRFS_DELAYED_INSERTION_ITEM; dir_item = (struct btrfs_dir_item *)delayed_item->data; dir_item->location = *disk_key; @@ -1381,15 +1476,52 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, btrfs_set_stack_dir_type(dir_item, type); memcpy((char *)(dir_item + 1), name, name_len); - ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, delayed_item); - /* - * we have reserved enough space when we start a new transaction, - * so reserving metadata failure is impossible - */ - BUG_ON(ret); + data_len = delayed_item->data_len + sizeof(struct btrfs_item); mutex_lock(&delayed_node->mutex); - ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item); + + if (delayed_node->index_item_leaves == 0 || + delayed_node->curr_index_batch_size + data_len > leaf_data_size) { + delayed_node->curr_index_batch_size = data_len; + reserve_leaf_space = true; + } else { + delayed_node->curr_index_batch_size += data_len; + reserve_leaf_space = false; + } + + if (reserve_leaf_space) { + ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, + delayed_item); + /* + * Space was reserved for a dir index item insertion when we + * started the transaction, so getting a failure here should be + * impossible. + */ + if (WARN_ON(ret)) { + mutex_unlock(&delayed_node->mutex); + btrfs_release_delayed_item(delayed_item); + goto release_node; + } + + delayed_node->index_item_leaves++; + } else if (!test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) { + const u64 bytes = btrfs_calc_insert_metadata_size(fs_info, 1); + + /* + * Adding the new dir index item does not require touching another + * leaf, so we can release 1 unit of metadata that was previously + * reserved when starting the transaction. This applies only to + * the case where we had a transaction start and excludes the + * transaction join case (when replaying log trees). 
+ */ + trace_btrfs_space_reservation(fs_info, "transaction", + trans->transid, bytes, 0); + btrfs_block_rsv_release(fs_info, trans->block_rsv, bytes, NULL); + ASSERT(trans->bytes_reserved >= bytes); + trans->bytes_reserved -= bytes; + } + + ret = __btrfs_add_delayed_item(delayed_node, delayed_item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(name: %.*s) into the insertion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1417,8 +1549,37 @@ static int btrfs_delete_delayed_insertion_item(struct btrfs_fs_info *fs_info, return 1; } - btrfs_delayed_item_release_metadata(node->root, item); + /* + * For delayed items to insert, we track reserved metadata bytes based + * on the number of leaves that we will use. + * See btrfs_insert_delayed_dir_index() and + * btrfs_delayed_item_reserve_metadata()). + */ + ASSERT(item->bytes_reserved == 0); + ASSERT(node->index_item_leaves > 0); + + /* + * If there's only one leaf reserved, we can decrement this item from the + * current batch, otherwise we can not because we don't know which leaf + * it belongs to. With the current limit on delayed items, we rarely + * accumulate enough dir index items to fill more than one leaf (even + * when using a leaf size of 4K). + */ + if (node->index_item_leaves == 1) { + const u32 data_len = item->data_len + sizeof(struct btrfs_item); + + ASSERT(node->curr_index_batch_size >= data_len); + node->curr_index_batch_size -= data_len; + } + btrfs_release_delayed_item(item); + + /* If we now have no more dir index items, we can release all leaves. */ + if (RB_EMPTY_ROOT(&node->ins_root.rb_root)) { + btrfs_delayed_item_release_leaves(node, node->index_item_leaves); + node->index_item_leaves = 0; + } + mutex_unlock(&node->mutex); return 0; } @@ -1451,6 +1612,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } item->key = item_key; + item->ins_or_del = BTRFS_DELAYED_DELETION_ITEM; ret = btrfs_delayed_item_reserve_metadata(trans, dir->root, item); /* @@ -1465,7 +1627,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, } mutex_lock(&node->mutex); - ret = __btrfs_add_delayed_deletion_item(node, item); + ret = __btrfs_add_delayed_item(node, item); if (unlikely(ret)) { btrfs_err(trans->fs_info, "err add delayed dir index item(index: %llu) into the deletion tree of the delayed node(root id: %llu, inode id: %llu, errno: %d)", @@ -1833,12 +1995,17 @@ static void __btrfs_kill_delayed_node(struct btrfs_delayed_node *delayed_node) mutex_lock(&delayed_node->mutex); curr_item = __btrfs_first_delayed_insertion_item(delayed_node); while (curr_item) { - btrfs_delayed_item_release_metadata(root, curr_item); prev_item = curr_item; curr_item = __btrfs_next_delayed_item(prev_item); btrfs_release_delayed_item(prev_item); } + if (delayed_node->index_item_leaves > 0) { + btrfs_delayed_item_release_leaves(delayed_node, + delayed_node->index_item_leaves); + delayed_node->index_item_leaves = 0; + } + curr_item = __btrfs_first_delayed_deletion_item(delayed_node); while (curr_item) { btrfs_delayed_item_release_metadata(root, curr_item); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index b2412160c5bc..9795dc295a18 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -58,6 +58,17 @@ struct btrfs_delayed_node { u64 index_cnt; unsigned long flags; int count; + /* + * The size of the next batch of dir index items to insert (if this + * node is from a directory inode). Protected by @mutex. 
+ */ + u32 curr_index_batch_size; + /* + * Number of leaves reserved for inserting dir index items (if this + * node belongs to a directory inode). This may be larger than the + * actual number of leaves we end up using. Protected by @mutex. + */ + u32 index_item_leaves; }; struct btrfs_delayed_item { diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 4176df149d04..36a3debe9493 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -132,7 +132,7 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans) spin_lock(&delayed_rsv->lock); delayed_rsv->size += num_bytes; - delayed_rsv->full = 0; + delayed_rsv->full = false; spin_unlock(&delayed_rsv->lock); trans->delayed_ref_updates = 0; } @@ -175,7 +175,7 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info, if (num_bytes) delayed_refs_rsv->reserved += num_bytes; if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size) - delayed_refs_rsv->full = 1; + delayed_refs_rsv->full = true; spin_unlock(&delayed_refs_rsv->lock); if (num_bytes) @@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans, is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID); ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action); - BUG_ON(extent_op && extent_op->is_data); ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS); if (!ref) return -ENOMEM; @@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans, return -ENOMEM; init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0, - BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data, - false); + BTRFS_UPDATE_DELAYED_HEAD, false, false); head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 91a3aabad150..d6304b690ec4 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op { u8 level; bool update_key; bool update_flags; - bool is_data; u64 flags_to_set; }; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f26202621989..f43196a893ca 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, struct btrfs_dev_extent *dev_extent = NULL; struct btrfs_block_group *cache; struct btrfs_trans_handle *trans; + int iter_ret = 0; int ret = 0; u64 chunk_offset; @@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, key.type = BTRFS_DEV_EXTENT_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto free_path; - if (ret > 0) { - if (path->slots[0] >= - btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto free_path; - if (ret > 0) { - ret = 0; - goto free_path; - } - } else { - ret = 0; - } - } - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; - int slot = path->slots[0]; - - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != src_dev->devid) break; @@ -557,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info, if (found_key.offset < key.offset) break; - dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent); + dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent); cache =
btrfs_lookup_block_group(fs_info, chunk_offset); if (!cache) - goto skip; + continue; spin_lock(&cache->lock); cache->to_copy = 1; spin_unlock(&cache->lock); btrfs_put_block_group(cache); - -skip: - ret = btrfs_next_item(root, path); - if (ret != 0) { - if (ret > 0) - ret = 0; - break; - } } + if (iter_ret < 0) + ret = iter_ret; -free_path: btrfs_free_path(path); unlock: mutex_unlock(&fs_info->chunk_mutex); @@ -614,7 +587,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, ASSERT(!IS_ERR(em)); map = em->map_lookup; - num_extents = cur_extent = 0; + num_extents = 0; + cur_extent = 0; for (i = 0; i < map->num_stripes; i++) { /* We have more device extent to copy */ if (srcdev != map->stripes[i].dev) @@ -881,6 +855,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *tgt_device; struct btrfs_device *src_device; struct btrfs_root *root = fs_info->tree_root; @@ -930,12 +905,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, WARN_ON(ret); /* Prevent write_all_supers() during the finishing procedure */ - mutex_lock(&fs_info->fs_devices->device_list_mutex); + mutex_lock(&fs_devices->device_list_mutex); /* Prevent new chunks being allocated on the source device */ mutex_lock(&fs_info->chunk_mutex); if (!list_empty(&src_device->post_commit_list)) { - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); mutex_unlock(&fs_info->chunk_mutex); } else { break; @@ -972,7 +947,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, error: up_write(&dev_replace->rwsem); mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); btrfs_rm_dev_replace_blocked(fs_info); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(tgt_device); @@ -1001,8 +976,8 @@ error: btrfs_assign_next_active_device(src_device, tgt_device); - list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); - fs_info->fs_devices->rw_devices++; + list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list); + fs_devices->rw_devices++; up_write(&dev_replace->rwsem); btrfs_rm_dev_replace_blocked(fs_info); @@ -1025,7 +1000,7 @@ error: * belong to this filesystem. 
*/ mutex_unlock(&fs_info->chunk_mutex); - mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&fs_devices->device_list_mutex); /* replace the sysfs entry */ btrfs_sysfs_remove_device(src_device); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index 3b532bab0755..72fb2c518a2b 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path, u64 dirid, const char *name, int name_len) { - struct extent_buffer *leaf; struct btrfs_dir_item *di; struct btrfs_key key; - u32 nritems; int ret; key.objectid = dirid; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - return ERR_PTR(ret); - - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - - while (1) { - if (path->slots[0] >= nritems) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - return ERR_PTR(ret); - if (ret > 0) - break; - leaf = path->nodes[0]; - nritems = btrfs_header_nritems(leaf); - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_for_each_slot(root, &key, &key, path, ret) { if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY) break; @@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root, name, name_len); if (di) return di; - - path->slots[0]++; } - return NULL; + /* Adjust return code if the key was not found in the next leaf. */ + if (ret > 0) + ret = 0; + + return ERR_PTR(ret); } struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 84795d831282..4c3166f3c725 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -51,7 +51,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, struct btrfs_fs_info *fs_info); @@ -64,40 +63,6 @@ static int btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info); static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info); -/* - * btrfs_end_io_wq structs are used to do processing in task context when an IO - * is complete. This is used during reads to verify checksums, and it is used - * by writes to insert metadata for new file extents after IO is complete. 
- */ -struct btrfs_end_io_wq { - struct bio *bio; - bio_end_io_t *end_io; - void *private; - struct btrfs_fs_info *info; - blk_status_t status; - enum btrfs_wq_endio_type metadata; - struct btrfs_work work; -}; - -static struct kmem_cache *btrfs_end_io_wq_cache; - -int __init btrfs_end_io_wq_init(void) -{ - btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", - sizeof(struct btrfs_end_io_wq), - 0, - SLAB_MEM_SPREAD, - NULL); - if (!btrfs_end_io_wq_cache) - return -ENOMEM; - return 0; -} - -void __cold btrfs_end_io_wq_exit(void) -{ - kmem_cache_destroy(btrfs_end_io_wq_cache); -} - static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info) { if (fs_info->csum_shash) @@ -256,8 +221,8 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, goto out; } btrfs_err_rl(eb->fs_info, - "parent transid verify failed on %llu wanted %llu found %llu", - eb->start, +"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu", + eb->start, eb->read_mirror, parent_transid, btrfs_header_generation(eb)); ret = 1; clear_extent_buffer_uptodate(eb); @@ -374,9 +339,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level, * @level: expected level, mandatory check * @first_key: expected key of first slot, skip check if NULL */ -static int btree_read_extent_buffer_pages(struct extent_buffer *eb, - u64 parent_transid, int level, - struct btrfs_key *first_key) +int btrfs_read_extent_buffer(struct extent_buffer *eb, + u64 parent_transid, int level, + struct btrfs_key *first_key) { struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; @@ -519,7 +484,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec u64 found_start; struct extent_buffer *eb; - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; @@ -587,21 +552,23 @@ static int validate_extent_buffer(struct extent_buffer *eb) found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - btrfs_err_rl(fs_info, "bad tree block start, want %llu have %llu", - eb->start, found_start); + btrfs_err_rl(fs_info, + "bad tree block start, mirror %u want %llu have %llu", + eb->read_mirror, eb->start, found_start); ret = -EIO; goto out; } if (check_tree_block_fsid(eb)) { - btrfs_err_rl(fs_info, "bad fsid on block %llu", - eb->start); + btrfs_err_rl(fs_info, "bad fsid on logical %llu mirror %u", + eb->start, eb->read_mirror); ret = -EIO; goto out; } found_level = btrfs_header_level(eb); if (found_level >= BTRFS_MAX_LEVEL) { - btrfs_err(fs_info, "bad tree block level %d on %llu", - (int)btrfs_header_level(eb), eb->start); + btrfs_err(fs_info, + "bad tree block level, mirror %u level %d on logical %llu", + eb->read_mirror, btrfs_header_level(eb), eb->start); ret = -EIO; goto out; } @@ -612,8 +579,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) if (memcmp(result, header_csum, csum_size) != 0) { btrfs_warn_rl(fs_info, - "checksum verify failed on %llu wanted " CSUM_FMT " found " CSUM_FMT " level %d", - eb->start, +"checksum verify failed on logical %llu mirror %u wanted " CSUM_FMT " found " CSUM_FMT " level %d", + eb->start, eb->read_mirror, CSUM_FMT_VALUE(csum_size, header_csum), CSUM_FMT_VALUE(csum_size, result), btrfs_header_level(eb)); @@ -638,8 +605,8 @@ static int validate_extent_buffer(struct extent_buffer *eb) set_extent_buffer_uptodate(eb); else btrfs_err(fs_info, - "block=%llu read time tree block corruption detected", - 
eb->start); + "read time tree block corruption detected on logical %llu mirror %u", + eb->start, eb->read_mirror); out: return ret; } @@ -704,7 +671,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, ASSERT(page->private); - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return validate_subpage_buffer(page, start, end, mirror); eb = (struct extent_buffer *)page->private; @@ -740,58 +707,6 @@ err: return ret; } -static void end_workqueue_bio(struct bio *bio) -{ - struct btrfs_end_io_wq *end_io_wq = bio->bi_private; - struct btrfs_fs_info *fs_info; - struct btrfs_workqueue *wq; - - fs_info = end_io_wq->info; - end_io_wq->status = bio->bi_status; - - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_METADATA) - wq = fs_info->endio_meta_write_workers; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_FREE_SPACE) - wq = fs_info->endio_freespace_worker; - else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else - wq = fs_info->endio_write_workers; - } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) - wq = fs_info->endio_raid56_workers; - else if (end_io_wq->metadata) - wq = fs_info->endio_meta_workers; - else - wq = fs_info->endio_workers; - } - - btrfs_init_work(&end_io_wq->work, end_workqueue_fn, NULL, NULL); - btrfs_queue_work(wq, &end_io_wq->work); -} - -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata) -{ - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); - if (!end_io_wq) - return BLK_STS_RESOURCE; - - end_io_wq->private = bio->bi_private; - end_io_wq->end_io = bio->bi_end_io; - end_io_wq->info = info; - end_io_wq->status = 0; - end_io_wq->bio = bio; - end_io_wq->metadata = metadata; - - bio->bi_private = end_io_wq; - bio->bi_end_io = end_workqueue_bio; - return 0; -} - static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async; @@ -816,7 +731,6 @@ static void run_one_async_done(struct btrfs_work *work) { struct async_submit_bio *async; struct inode *inode; - blk_status_t ret; async = container_of(work, struct async_submit_bio, work); inode = async->inode; @@ -834,11 +748,7 @@ static void run_one_async_done(struct btrfs_work *work) * This changes nothing when cgroups aren't in use. */ async->bio->bi_opf |= REQ_CGROUP_PUNT; - ret = btrfs_map_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); - if (ret) { - async->bio->bi_status = ret; - bio_endio(async->bio); - } + btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num); } static void run_one_async_free(struct btrfs_work *work) @@ -849,17 +759,23 @@ static void run_one_async_free(struct btrfs_work *work) kfree(async); } -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start) +/* + * Submit bio to an async queue. 
+ * + * Return: + * - true if the work has been successfully submitted + * - false in case of error + */ +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct async_submit_bio *async; async = kmalloc(sizeof(*async), GFP_NOFS); if (!async) - return BLK_STS_RESOURCE; + return false; async->inode = inode; async->bio = bio; @@ -874,10 +790,10 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, async->status = 0; if (op_is_sync(bio->bi_opf)) - btrfs_set_work_high_priority(&async->work); - - btrfs_queue_work(fs_info->workers, &async->work); - return 0; + btrfs_queue_work(fs_info->hipri_workers, &async->work); + else + btrfs_queue_work(fs_info->workers, &async->work); + return true; } static blk_status_t btree_csum_one_bio(struct bio *bio) @@ -903,7 +819,7 @@ static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio, { /* * when we're called for a write, we're already in the async - * submission context. Just jump into btrfs_map_bio + * submission context. Just jump into btrfs_submit_bio. */ return btree_csum_one_bio(bio); } @@ -920,69 +836,59 @@ static bool should_async_write(struct btrfs_fs_info *fs_info, return true; } -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); blk_status_t ret; + bio->bi_opf |= REQ_META; + if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - /* - * called for a read, do the setup so that checksum validation - * can happen in the async kernel threads - */ - ret = btrfs_bio_wq_end_io(fs_info, bio, - BTRFS_WQ_ENDIO_METADATA); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else if (!should_async_write(fs_info, BTRFS_I(inode))) { - ret = btree_csum_one_bio(bio); - if (ret) - goto out_w_error; - ret = btrfs_map_bio(fs_info, bio, mirror_num); - } else { - /* - * kthread helpers are used to submit writes so that - * checksumming can happen in parallel across all CPUs - */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0, - 0, btree_submit_bio_start); + btrfs_submit_bio(fs_info, bio, mirror_num); + return; } - if (ret) - goto out_w_error; - return 0; + /* + * Kthread helpers are used to submit writes so that checksumming can + * happen in parallel across all CPUs. + */ + if (should_async_write(fs_info, BTRFS_I(inode)) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start)) + return; -out_w_error: - bio->bi_status = ret; - bio_endio(bio); - return ret; + ret = btree_csum_one_bio(bio); + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } + + btrfs_submit_bio(fs_info, bio, mirror_num); } #ifdef CONFIG_MIGRATION -static int btree_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, - enum migrate_mode mode) +static int btree_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { /* * we can't safely write a btree page from here, * we haven't done the locking hook */ - if (PageDirty(page)) + if (folio_test_dirty(src)) return -EAGAIN; /* * Buffers may be managed in a filesystem specific way. * We must have no buffers or drop them. 
*/ - if (page_has_private(page) && - !try_to_release_page(page, GFP_KERNEL)) + if (folio_get_private(src) && + !filemap_release_folio(src, GFP_KERNEL)) return -EAGAIN; - return migrate_page(mapping, newpage, page, mode); + return migrate_folio(mapping, dst, src, mode); } +#else +#define btree_migrate_folio NULL #endif - static int btree_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1005,12 +911,12 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_releasepage(struct page *page, gfp_t gfp_flags) +static bool btree_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; - return try_release_extent_buffer(page); + return try_release_extent_buffer(&folio->page); } static void btree_invalidate_folio(struct folio *folio, size_t offset, @@ -1019,7 +925,7 @@ static void btree_invalidate_folio(struct folio *folio, size_t offset, struct extent_io_tree *tree; tree = &BTRFS_I(folio->mapping->host)->io_tree; extent_invalidate_folio(tree, folio, offset); - btree_releasepage(&folio->page, GFP_NOFS); + btree_release_folio(folio, GFP_NOFS); if (folio_get_private(folio)) { btrfs_warn(BTRFS_I(folio->mapping->host)->root->fs_info, "folio private not zero on folio %llu", @@ -1080,12 +986,10 @@ static bool btree_dirty_folio(struct address_space *mapping, static const struct address_space_operations btree_aops = { .writepages = btree_writepages, - .releasepage = btree_releasepage, + .release_folio = btree_release_folio, .invalidate_folio = btree_invalidate_folio, -#ifdef CONFIG_MIGRATION - .migratepage = btree_migratepage, -#endif - .dirty_folio = btree_dirty_folio, + .migrate_folio = btree_migrate_folio, + .dirty_folio = btree_dirty_folio, }; struct extent_buffer *btrfs_find_create_tree_block( @@ -1118,12 +1022,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, if (IS_ERR(buf)) return buf; - ret = btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); + ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key); if (ret) { free_extent_buffer_stale(buf); return ERR_PTR(ret); } + if (btrfs_check_eb_owner(buf, owner_root)) { + free_extent_buffer_stale(buf); + return ERR_PTR(-EUCLEAN); + } return buf; } @@ -1563,6 +1470,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root, ret = -EIO; goto fail; } + + /* + * For real fs, and not log/reloc trees, root owner must + * match its root node owner + */ + if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID && + root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID && + root->root_key.objectid != btrfs_header_owner(root->node)) { + btrfs_crit(fs_info, +"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu", + root->root_key.objectid, root->node->start, + btrfs_header_owner(root->node), + root->root_key.objectid); + ret = -EUCLEAN; + goto fail; + } root->commit_root = btrfs_root_node(root); return root; fail: @@ -1860,7 +1784,7 @@ again: fail: /* * If our caller provided us an anonymous device, then it's his - * responsability to free it in case we fail. So we have to set our + * responsibility to free it in case we fail. So we have to set our * root's anon_dev to 0 to avoid a double free, once by btrfs_put_root() * and once again by our caller. 
*/ @@ -1943,28 +1867,9 @@ struct btrfs_root *btrfs_get_fs_root_commit_root(struct btrfs_fs_info *fs_info, return root; } -/* - * called by the kthread helper functions to finally call the bio end_io - * functions. This is where read checksum verification actually happens - */ -static void end_workqueue_fn(struct btrfs_work *work) -{ - struct bio *bio; - struct btrfs_end_io_wq *end_io_wq; - - end_io_wq = container_of(work, struct btrfs_end_io_wq, work); - bio = end_io_wq->bio; - - bio->bi_status = end_io_wq->status; - bio->bi_private = end_io_wq->private; - bio->bi_end_io = end_io_wq->end_io; - bio_endio(bio); - kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); -} - static int cleaner_kthread(void *arg) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg; + struct btrfs_fs_info *fs_info = arg; int again; while (1) { @@ -2266,10 +2171,16 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) { btrfs_destroy_workqueue(fs_info->fixup_workers); btrfs_destroy_workqueue(fs_info->delalloc_workers); + btrfs_destroy_workqueue(fs_info->hipri_workers); btrfs_destroy_workqueue(fs_info->workers); - btrfs_destroy_workqueue(fs_info->endio_workers); - btrfs_destroy_workqueue(fs_info->endio_raid56_workers); - btrfs_destroy_workqueue(fs_info->rmw_workers); + if (fs_info->endio_workers) + destroy_workqueue(fs_info->endio_workers); + if (fs_info->endio_raid56_workers) + destroy_workqueue(fs_info->endio_raid56_workers); + if (fs_info->rmw_workers) + destroy_workqueue(fs_info->rmw_workers); + if (fs_info->compressed_write_workers) + destroy_workqueue(fs_info->compressed_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); btrfs_destroy_workqueue(fs_info->endio_freespace_worker); btrfs_destroy_workqueue(fs_info->delayed_workers); @@ -2283,8 +2194,8 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) * the queues used for metadata I/O, since tasks from those other work * queues can do metadata I/O operations. 
*/ - btrfs_destroy_workqueue(fs_info->endio_meta_workers); - btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); + if (fs_info->endio_meta_workers) + destroy_workqueue(fs_info->endio_meta_workers); } static void free_root_extent_buffers(struct btrfs_root *root) @@ -2414,7 +2325,9 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); - memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); + BTRFS_I(inode)->location.objectid = BTRFS_BTREE_INODE_OBJECTID; + BTRFS_I(inode)->location.type = 0; + BTRFS_I(inode)->location.offset = 0; set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); btrfs_insert_inode_hash(inode); } @@ -2444,7 +2357,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND; fs_info->workers = - btrfs_alloc_workqueue(fs_info, "worker", + btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16); + fs_info->hipri_workers = + btrfs_alloc_workqueue(fs_info, "worker-high", flags | WQ_HIGHPRI, max_active, 16); fs_info->delalloc_workers = @@ -2461,26 +2376,18 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->fixup_workers = btrfs_alloc_workqueue(fs_info, "fixup", flags, 1, 0); - /* - * endios are largely parallel and should have a very - * low idle thresh - */ fs_info->endio_workers = - btrfs_alloc_workqueue(fs_info, "endio", flags, max_active, 4); + alloc_workqueue("btrfs-endio", flags, max_active); fs_info->endio_meta_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta", flags, - max_active, 4); - fs_info->endio_meta_write_workers = - btrfs_alloc_workqueue(fs_info, "endio-meta-write", flags, - max_active, 2); + alloc_workqueue("btrfs-endio-meta", flags, max_active); fs_info->endio_raid56_workers = - btrfs_alloc_workqueue(fs_info, "endio-raid56", flags, - max_active, 4); - fs_info->rmw_workers = - btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2); + alloc_workqueue("btrfs-endio-raid56", flags, max_active); + fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active); fs_info->endio_write_workers = btrfs_alloc_workqueue(fs_info, "endio-write", flags, max_active, 2); + fs_info->compressed_write_workers = + alloc_workqueue("btrfs-compressed-write", flags, max_active); fs_info->endio_freespace_worker = btrfs_alloc_workqueue(fs_info, "freespace-write", flags, max_active, 0); @@ -2492,10 +2399,10 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) fs_info->discard_ctl.discard_workers = alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1); - if (!(fs_info->workers && fs_info->delalloc_workers && - fs_info->flush_workers && + if (!(fs_info->workers && fs_info->hipri_workers && + fs_info->delalloc_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && - fs_info->endio_meta_write_workers && + fs_info->compressed_write_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->fixup_workers && @@ -2522,6 +2429,9 @@ static int btrfs_init_csum_hash(struct btrfs_fs_info *fs_info, u16 csum_type) fs_info->csum_shash = csum_shash; + btrfs_info(fs_info, "using %s (%s) checksum algorithm", + btrfs_super_csum_name(csum_type), + crypto_shash_driver_name(csum_shash)); return 0; } @@ -2815,12 +2725,14 @@ static int validate_super(struct btrfs_fs_info 
*fs_info, } /* - * For 4K page size, we only support 4K sector size. - * For 64K page size, we support 64K and 4K sector sizes. + * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * + * We can support 16K sectorsize with 64K page size without problem, + * but such sectorsize/pagesize combination doesn't make much sense. + * 4K will be our future standard, PAGE_SIZE is supported from the very + * beginning. */ - if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) || - (PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K && - sectorsize != SZ_64K))) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -3209,9 +3121,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) btrfs_init_balance(fs_info); btrfs_init_async_reclaim_work(fs_info); - spin_lock_init(&fs_info->block_group_cache_lock); - fs_info->block_group_cache_tree = RB_ROOT; - fs_info->first_logical_byte = (u64)-1; + rwlock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree = RB_ROOT_CACHED; extent_io_tree_init(fs_info, &fs_info->excluded_extents, IO_TREE_FS_EXCLUDED_EXTENTS, NULL); @@ -3239,6 +3150,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) init_waitqueue_head(&fs_info->transaction_blocked_wait); init_waitqueue_head(&fs_info->async_submit_wait); init_waitqueue_head(&fs_info->delayed_iputs_wait); + init_waitqueue_head(&fs_info->zone_finish_wait); /* Usable values until the real ones are cached from the superblock */ fs_info->nodesize = 4096; @@ -3246,6 +3158,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->sectorsize_bits = ilog2(4096); fs_info->stripesize = 4096; + fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE; + spin_lock_init(&fs_info->swapfile_pins_lock); fs_info->swapfile_pins = RB_ROOT; @@ -3295,7 +3209,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block static int btrfs_uuid_rescan_kthread(void *data) { - struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data; + struct btrfs_fs_info *fs_info = data; int ret; /* @@ -3577,16 +3491,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->compress_type = BTRFS_COMPRESS_ZLIB; - /* - * Flag our filesystem as having big metadata blocks if they are bigger - * than the page size. - */ - if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) { - if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) - btrfs_info(fs_info, - "flagging fs with big metadata feature"); - features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; - } /* Set up fs_info before parsing mount options */ nodesize = btrfs_super_nodesize(disk_super); @@ -3611,7 +3515,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_INCOMPAT_SUPP; if (features) { btrfs_err(fs_info, - "cannot mount because of unsupported optional features (%llx)", + "cannot mount because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; @@ -3624,8 +3528,12 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device else if (fs_info->compress_type == BTRFS_COMPRESS_ZSTD) features |= BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD; - if (features & BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA) - btrfs_info(fs_info, "has skinny extents"); + /* + * Flag our filesystem as having big metadata blocks if they are bigger + * than the page size. 
+ */ + if (btrfs_super_nodesize(disk_super) > PAGE_SIZE) + features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; /* * mixed block groups end up with duplicate but slightly offset @@ -3649,11 +3557,25 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device ~BTRFS_FEATURE_COMPAT_RO_SUPP; if (!sb_rdonly(sb) && features) { btrfs_err(fs_info, - "cannot mount read-write because of unsupported optional features (%llx)", + "cannot mount read-write because of unsupported optional features (0x%llx)", features); err = -EINVAL; goto fail_alloc; } + /* + * We have unsupported RO compat features, although RO mounted, we + * should not cause any metadata write, including log replay. + * Or we could screw up whatever the new feature requires. + */ + if (unlikely(features && btrfs_super_log_root(disk_super) && + !btrfs_test_opt(fs_info, NOLOGREPLAY))) { + btrfs_err(fs_info, +"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay", + features); + err = -EINVAL; + goto fail_alloc; + } + if (sectorsize < PAGE_SIZE) { struct btrfs_subpage_info *subpage_info; @@ -3672,14 +3594,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device btrfs_warn(fs_info, "read-write for sector size %u with page size %lu is experimental", sectorsize, PAGE_SIZE); - if (btrfs_super_incompat_flags(fs_info->super_copy) & - BTRFS_FEATURE_INCOMPAT_RAID56) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sector size %u with page size %lu", - sectorsize, PAGE_SIZE); - err = -EINVAL; - goto fail_alloc; - } subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL); if (!subpage_info) goto fail_alloc; @@ -4157,7 +4071,8 @@ static int write_dev_supers(struct btrfs_device *device, if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER)) bio->bi_opf |= REQ_FUA; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); if (btrfs_advance_sb_log(device, i)) errors++; @@ -4271,7 +4186,8 @@ static void write_dev_flush(struct btrfs_device *device) init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state); } @@ -4640,6 +4556,17 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) int ret; set_bit(BTRFS_FS_CLOSING_START, &fs_info->flags); + + /* + * We may have the reclaim task running and relocating a data block group, + * in which case it may create delayed iputs. So stop it before we park + * the cleaner kthread otherwise we can get new delayed iputs after + * parking the cleaner, and that can make the async reclaim task to hang + * if it's waiting for delayed iputs to complete, since the cleaner is + * parked and can not run delayed iputs - this will make us hang when + * trying to stop the async reclaim task. + */ + cancel_work_sync(&fs_info->reclaim_bgs_work); /* * We don't want the cleaner to start new transactions, add more delayed * iputs, etc. while we're closing. 
We can't use kthread_stop() yet @@ -4680,8 +4607,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); - cancel_work_sync(&fs_info->reclaim_bgs_work); - /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); @@ -4863,13 +4788,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info) __btrfs_btree_balance_dirty(fs_info, 0); } -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key) -{ - return btree_read_extent_buffer_pages(buf, parent_transid, - level, first_key); -} - static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) { /* cleanup FS via transaction */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 2e10514ecda8..8993b428e09c 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -17,13 +17,6 @@ */ #define BTRFS_BDEV_BLOCKSIZE (4096) -enum btrfs_wq_endio_type { - BTRFS_WQ_ENDIO_DATA, - BTRFS_WQ_ENDIO_METADATA, - BTRFS_WQ_ENDIO_FREE_SPACE, - BTRFS_WQ_ENDIO_RAID56, -}; - static inline u64 btrfs_sb_offset(int mirror) { u64 start = SZ_16K; @@ -87,8 +80,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio, struct page *page, u64 start, u64 end, int mirror); -blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags); +void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif @@ -120,14 +112,11 @@ void btrfs_put_root(struct btrfs_root *root); void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level, - struct btrfs_key *first_key); -blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - enum btrfs_wq_endio_type metadata); -blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags, - u64 dio_file_offset, - extent_submit_bio_start_t *submit_bio_start); +int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid, + int level, struct btrfs_key *first_key); +bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num, + u64 dio_file_offset, + extent_submit_bio_start_t *submit_bio_start); blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio, int mirror_num); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, @@ -147,8 +136,6 @@ int btree_lock_page_hook(struct page *page, void *data, int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid); int btrfs_init_root_free_objectid(struct btrfs_root *root); -int __init btrfs_end_io_wq_init(void); -void __cold btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_set_buffer_lockdep_class(u64 objectid, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6260784e74b5..ea3ec1e761e8 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -895,7 +895,13 @@ again: err = -ENOENT; while (1) { if (ptr >= end) { - WARN_ON(ptr > end); + if (ptr > end) { + err = -EUCLEAN; + btrfs_print_leaf(path->nodes[0]); + btrfs_crit(fs_info, +"overrun extent 
record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu", + path->slots[0], root_objectid, owner, offset, parent); + } break; } iref = (struct btrfs_extent_inline_ref *)ptr; @@ -1263,7 +1269,7 @@ static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len, return ret; } -static int do_discard_extent(struct btrfs_io_stripe *stripe, u64 *bytes) +static int do_discard_extent(struct btrfs_discard_stripe *stripe, u64 *bytes) { struct btrfs_device *dev = stripe->dev; struct btrfs_fs_info *fs_info = dev->fs_info; @@ -1310,76 +1316,60 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, u64 discarded_bytes = 0; u64 end = bytenr + num_bytes; u64 cur = bytenr; - struct btrfs_io_context *bioc = NULL; /* - * Avoid races with device replace and make sure our bioc has devices - * associated to its stripes that don't go away while we are discarding. + * Avoid races with device replace and make sure the devices in the + * stripes don't go away while we are discarding. */ btrfs_bio_counter_inc_blocked(fs_info); while (cur < end) { - struct btrfs_io_stripe *stripe; + struct btrfs_discard_stripe *stripes; + unsigned int num_stripes; int i; num_bytes = end - cur; - /* Tell the block device(s) that the sectors can be discarded */ - ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur, - &num_bytes, &bioc, 0); - /* - * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or - * -EOPNOTSUPP. For any such error, @num_bytes is not updated, - * thus we can't continue anyway. - */ - if (ret < 0) - goto out; + stripes = btrfs_map_discard(fs_info, cur, &num_bytes, &num_stripes); + if (IS_ERR(stripes)) { + ret = PTR_ERR(stripes); + if (ret == -EOPNOTSUPP) + ret = 0; + break; + } - stripe = bioc->stripes; - for (i = 0; i < bioc->num_stripes; i++, stripe++) { + for (i = 0; i < num_stripes; i++) { + struct btrfs_discard_stripe *stripe = stripes + i; u64 bytes; - struct btrfs_device *device = stripe->dev; - if (!device->bdev) { + if (!stripe->dev->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } - if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, + &stripe->dev->dev_state)) continue; ret = do_discard_extent(stripe, &bytes); - if (!ret) { - discarded_bytes += bytes; - } else if (ret != -EOPNOTSUPP) { + if (ret) { /* - * Logic errors or -ENOMEM, or -EIO, but - * unlikely to happen. - * - * And since there are two loops, explicitly - * go to out to avoid confusion. + * Keep going if discard is not supported by the + * device. */ - btrfs_put_bioc(bioc); - goto out; + if (ret != -EOPNOTSUPP) + break; + ret = 0; + } else { + discarded_bytes += bytes; } - - /* - * Just in case we get back EOPNOTSUPP for some reason, - * just ignore the return value so we don't screw up - * people calling discard_extent. 
- */ - ret = 0; } - btrfs_put_bioc(bioc); + kfree(stripes); + if (ret) + break; cur += num_bytes; } -out: btrfs_bio_counter_dec(fs_info); - if (actual_bytes) *actual_bytes = discarded_bytes; - - - if (ret == -EOPNOTSUPP) - ret = 0; return ret; } @@ -1577,12 +1567,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, u32 item_size; int ret; int err = 0; - int metadata = !extent_op->is_data; + int metadata = 1; if (TRANS_ABORTED(trans)) return 0; - if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA)) metadata = 0; path = btrfs_alloc_path(); @@ -2180,7 +2170,7 @@ out: int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, struct extent_buffer *eb, u64 flags, - int level, int is_data) + int level) { struct btrfs_delayed_extent_op *extent_op; int ret; @@ -2192,7 +2182,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_flags = true; extent_op->update_key = false; - extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op); @@ -2357,15 +2346,10 @@ out: } int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, - u64 bytenr, bool strict) + u64 bytenr, bool strict, struct btrfs_path *path) { - struct btrfs_path *path; int ret; - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - do { ret = check_committed_ref(root, path, objectid, offset, bytenr, strict); @@ -2376,7 +2360,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset, } while (ret == -EAGAIN); out: - btrfs_free_path(path); + btrfs_release_path(path); if (btrfs_is_data_reloc_root(root)) WARN_ON(ret > 0); return ret; @@ -2497,24 +2481,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) return ret; } -static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start) +static u64 first_logical_byte(struct btrfs_fs_info *fs_info) { - struct btrfs_block_group *cache; - u64 bytenr; - - spin_lock(&fs_info->block_group_cache_lock); - bytenr = fs_info->first_logical_byte; - spin_unlock(&fs_info->block_group_cache_lock); + struct rb_node *leftmost; + u64 bytenr = 0; - if (bytenr < (u64)-1) - return bytenr; + read_lock(&fs_info->block_group_cache_lock); + /* Get the block group with the lowest logical start address. */ + leftmost = rb_first_cached(&fs_info->block_group_cache_tree); + if (leftmost) { + struct btrfs_block_group *bg; - cache = btrfs_lookup_first_block_group(fs_info, search_start); - if (!cache) - return 0; - - bytenr = cache->start; - btrfs_put_block_group(cache); + bg = rb_entry(leftmost, struct btrfs_block_group, cache_node); + bytenr = bg->start; + } + read_unlock(&fs_info->block_group_cache_lock); return bytenr; } @@ -3803,8 +3784,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, /* Check RO and no space case before trying to activate it */ spin_lock(&block_group->lock); - if (block_group->ro || - block_group->alloc_offset == block_group->zone_capacity) { + if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) { ret = 1; /* * May need to clear fs_info->{treelog,data_reloc}_bg. 
@@ -3836,7 +3816,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3898,8 +3878,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); @@ -3969,23 +3965,63 @@ static void found_extent(struct find_free_extent_ctl *ffe_ctl, } } -static bool can_allocate_chunk(struct btrfs_fs_info *fs_info, - struct find_free_extent_ctl *ffe_ctl) +static int can_allocate_chunk_zoned(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) +{ + /* If we can activate new zone, just allocate a chunk and use it */ + if (btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) + return 0; + + /* + * We already reached the max active zones. Try to finish one block + * group to make room for a new block group. This is only possible + * for a data block group because btrfs_zone_finish() may need to wait + * for a running transaction which can cause a deadlock for metadata + * allocation. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) { + int ret = btrfs_zone_finish_one_bg(fs_info); + + if (ret == 1) + return 0; + else if (ret < 0) + return ret; + } + + /* + * If we have enough free space left in an already active block group + * and we can't activate any other zone now, do not allow allocating a + * new chunk and let find_free_extent() retry with a smaller size. + */ + if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size) + return -ENOSPC; + + /* + * Even min_alloc_size is not left in any block groups. Since we cannot + * activate a new block group, allocating it may not help. Let's tell a + * caller to try again and hope it progresses something by writing some + * parts of the region. That is only possible for data block groups, + * where a part of the region can be written. + */ + if (ffe_ctl->flags & BTRFS_BLOCK_GROUP_DATA) + return -EAGAIN; + + /* + * We cannot activate a new block group and there is not enough space + * left in any block groups. So, allocating a new block group may not + * help. But, there is nothing to do anyway, so let's go with it. 
+ */ + return 0; +} + +static int can_allocate_chunk(struct btrfs_fs_info *fs_info, + struct find_free_extent_ctl *ffe_ctl) { switch (ffe_ctl->policy) { case BTRFS_EXTENT_ALLOC_CLUSTERED: - return true; + return 0; case BTRFS_EXTENT_ALLOC_ZONED: - /* - * If we have enough free space left in an already - * active block group and we can't activate any other - * zone now, do not allow allocating a new chunk and - * let find_free_extent() retry with a smaller size. - */ - if (ffe_ctl->max_extent_size >= ffe_ctl->min_alloc_size && - !btrfs_can_activate_zone(fs_info->fs_devices, ffe_ctl->flags)) - return false; - return true; + return can_allocate_chunk_zoned(fs_info, ffe_ctl); default: BUG(); } @@ -4067,8 +4103,9 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, int exist = 0; /*Check if allocation policy allows to create a new chunk */ - if (!can_allocate_chunk(fs_info, ffe_ctl)) - return -ENOSPC; + ret = can_allocate_chunk(fs_info, ffe_ctl); + if (ret) + return ret; trans = current->journal_info; if (trans) @@ -4272,7 +4309,7 @@ static noinline int find_free_extent(struct btrfs_root *root, return ret; ffe_ctl->search_start = max(ffe_ctl->search_start, - first_logical_byte(fs_info, 0)); + first_logical_byte(fs_info)); ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte); if (ffe_ctl->search_start == ffe_ctl->hint_byte) { block_group = btrfs_lookup_block_group(fs_info, @@ -4959,7 +4996,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, extent_op->flags_to_set = flags; extent_op->update_key = skinny_metadata ? false : true; extent_op->update_flags = true; - extent_op->is_data = false; extent_op->level = level; btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT, @@ -5144,7 +5180,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_dec_ref(trans, root, eb, 0); BUG_ON(ret); /* -ENOMEM */ ret = btrfs_set_disk_extent_flags(trans, eb, flag, - btrfs_header_level(eb), 0); + btrfs_header_level(eb)); BUG_ON(ret); /* -ENOMEM */ wc->flags[level] |= flag; } @@ -5981,7 +6017,7 @@ int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info, */ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) { - u64 start = SZ_1M, len = 0, end = 0; + u64 start = BTRFS_DEVICE_RANGE_RESERVED, len = 0, end = 0; int ret; *trimmed = 0; @@ -6025,8 +6061,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) break; } - /* Ensure we skip the reserved area in the first 1M */ - start = max_t(u64, start, SZ_1M); + /* Ensure we skip the reserved space on each device. 
*/ + start = max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); /* * If find_first_clear_extent_bit find a range that spans the diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 33c19f51d79b..bfae67c593c5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/pagemap.h> #include <linux/page-flags.h> +#include <linux/sched/mm.h> #include <linux/spinlock.h> #include <linux/blkdev.h> #include <linux/swap.h> @@ -28,6 +29,7 @@ #include "subpage.h" #include "zoned.h" #include "block-group.h" +#include "compression.h" static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; @@ -75,6 +77,7 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) if (!fs_info->allocated_ebs.next) return; + WARN_ON(!list_empty(&fs_info->allocated_ebs)); spin_lock_irqsave(&fs_info->eb_leak_lock, flags); while (!list_empty(&fs_info->allocated_ebs)) { eb = list_first_entry(&fs_info->allocated_ebs, @@ -135,6 +138,18 @@ struct tree_entry { struct rb_node rb_node; }; +/* + * Structure to record info about the bio being assembled, and other info like + * how many bytes are there before stripe/ordered extent boundary. + */ +struct btrfs_bio_ctrl { + struct bio *bio; + int mirror_num; + enum btrfs_compression_type compress_type; + u32 len_to_stripe_boundary; + u32 len_to_oe_boundary; +}; + struct extent_page_data { struct btrfs_bio_ctrl bio_ctrl; /* tells writepage not to lock the state bits for this range @@ -164,61 +179,57 @@ static int add_extent_changeset(struct extent_state *state, u32 bits, return ret; } -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +static void submit_one_bio(struct btrfs_bio_ctrl *bio_ctrl) { - blk_status_t ret = 0; - struct extent_io_tree *tree = bio->bi_private; + struct bio *bio; + struct bio_vec *bv; + struct inode *inode; + int mirror_num; + + if (!bio_ctrl->bio) + return; - bio->bi_private = NULL; + bio = bio_ctrl->bio; + bv = bio_first_bvec_all(bio); + inode = bv->bv_page->mapping->host; + mirror_num = bio_ctrl->mirror_num; /* Caller should ensure the bio has at least some range added */ ASSERT(bio->bi_iter.bi_size); - if (is_data_inode(tree->private_data)) - ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, - bio_flags); - else - ret = btrfs_submit_metadata_bio(tree->private_data, bio, - mirror_num, bio_flags); - return blk_status_to_errno(ret); -} + btrfs_bio(bio)->file_offset = page_offset(bv->bv_page) + bv->bv_offset; -/* Cleanup unsubmitted bios */ -static void end_write_bio(struct extent_page_data *epd, int ret) -{ - struct bio *bio = epd->bio_ctrl.bio; + if (!is_data_inode(inode)) + btrfs_submit_metadata_bio(inode, bio, mirror_num); + else if (btrfs_op(bio) == BTRFS_MAP_WRITE) + btrfs_submit_data_write_bio(inode, bio, mirror_num); + else + btrfs_submit_data_read_bio(inode, bio, mirror_num, + bio_ctrl->compress_type); - if (bio) { - bio->bi_status = errno_to_blk_status(ret); - bio_endio(bio); - epd->bio_ctrl.bio = NULL; - } + /* The bio is owned by the bi_end_io handler now */ + bio_ctrl->bio = NULL; } /* - * Submit bio from extent page data via submit_one_bio - * - * Return 0 if everything is OK. - * Return <0 for error. + * Submit or fail the current bio in an extent_page_data structure. 
*/ -static int __must_check flush_write_bio(struct extent_page_data *epd) +static void submit_write_bio(struct extent_page_data *epd, int ret) { - int ret = 0; struct bio *bio = epd->bio_ctrl.bio; - if (bio) { - ret = submit_one_bio(bio, 0, 0); - /* - * Clean up of epd->bio is handled by its endio function. - * And endio is either triggered by successful bio execution - * or the error handler of submit bio hook. - * So at this point, no matter what happened, we don't need - * to clean up epd->bio. - */ + if (!bio) + return; + + if (ret) { + ASSERT(ret < 0); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + /* The bio is owned by the bi_end_io handler now */ epd->bio_ctrl.bio = NULL; + } else { + submit_one_bio(&epd->bio_ctrl); } - return ret; } int __init extent_state_cache_init(void) @@ -361,131 +372,121 @@ void free_extent_state(struct extent_state *state) } } -static struct rb_node *tree_insert(struct rb_root *root, - struct rb_node *search_start, - u64 offset, - struct rb_node *node, - struct rb_node ***p_in, - struct rb_node **parent_in) -{ - struct rb_node **p; - struct rb_node *parent = NULL; - struct tree_entry *entry; - - if (p_in && parent_in) { - p = *p_in; - parent = *parent_in; - goto do_insert; - } - - p = search_start ? &search_start : &root->rb_node; - while (*p) { - parent = *p; - entry = rb_entry(parent, struct tree_entry, rb_node); - - if (offset < entry->start) - p = &(*p)->rb_left; - else if (offset > entry->end) - p = &(*p)->rb_right; - else - return parent; - } - -do_insert: - rb_link_node(node, parent, p); - rb_insert_color(node, root); - return NULL; -} - /** * Search @tree for an entry that contains @offset. Such entry would have * entry->start <= offset && entry->end >= offset. * * @tree: the tree to search * @offset: offset that should fall within an entry in @tree - * @next_ret: pointer to the first entry whose range ends after @offset - * @prev_ret: pointer to the first entry whose range begins before @offset - * @p_ret: pointer where new node should be anchored (used when inserting an + * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * This function returns a pointer to the entry that contains @offset byte - * address. If no such entry exists, then NULL is returned and the other - * pointer arguments to the function are filled, otherwise the found entry is - * returned and other pointers are left untouched. + * Return a pointer to the entry that contains @offset byte address and don't change + * @node_ret and @parent_ret. + * + * If no such entry exists, return pointer to entry that ends before @offset + * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. 
*/ -static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, - struct rb_node **next_ret, - struct rb_node **prev_ret, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +static inline struct rb_node *tree_search_for_insert(struct extent_io_tree *tree, + u64 offset, + struct rb_node ***node_ret, + struct rb_node **parent_ret) { struct rb_root *root = &tree->state; - struct rb_node **n = &root->rb_node; + struct rb_node **node = &root->rb_node; struct rb_node *prev = NULL; - struct rb_node *orig_prev = NULL; struct tree_entry *entry; - struct tree_entry *prev_entry = NULL; - while (*n) { - prev = *n; + while (*node) { + prev = *node; entry = rb_entry(prev, struct tree_entry, rb_node); - prev_entry = entry; if (offset < entry->start) - n = &(*n)->rb_left; + node = &(*node)->rb_left; else if (offset > entry->end) - n = &(*n)->rb_right; + node = &(*node)->rb_right; else - return *n; + return *node; } - if (p_ret) - *p_ret = n; + if (node_ret) + *node_ret = node; if (parent_ret) *parent_ret = prev; - if (next_ret) { - orig_prev = prev; - while (prev && offset > prev_entry->end) { - prev = rb_next(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *next_ret = prev; - prev = orig_prev; + /* Search neighbors until we find the first one past the end */ + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); } - if (prev_ret) { - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - while (prev && offset < prev_entry->start) { - prev = rb_prev(prev); - prev_entry = rb_entry(prev, struct tree_entry, rb_node); - } - *prev_ret = prev; - } - return NULL; + return prev; } -static inline struct rb_node * -tree_search_for_insert(struct extent_io_tree *tree, - u64 offset, - struct rb_node ***p_ret, - struct rb_node **parent_ret) +/* + * Inexact rb-tree search, return the next entry if @offset is not found + */ +static inline struct rb_node *tree_search(struct extent_io_tree *tree, u64 offset) { - struct rb_node *next= NULL; - struct rb_node *ret; - - ret = __etree_search(tree, offset, &next, NULL, p_ret, parent_ret); - if (!ret) - return next; - return ret; + return tree_search_for_insert(tree, offset, NULL, NULL); } -static inline struct rb_node *tree_search(struct extent_io_tree *tree, - u64 offset) +/** + * Search offset in the tree or fill neighbor rbtree node pointers. + * + * @tree: the tree to search + * @offset: offset that should fall within an entry in @tree + * @next_ret: pointer to the first entry whose range ends after @offset + * @prev_ret: pointer to the first entry whose range begins before @offset + * + * Return a pointer to the entry that contains @offset byte address. If no + * such entry exists, then return NULL and fill @prev_ret and @next_ret. + * Otherwise return the found entry and other pointers are left untouched. 
+ */ +static struct rb_node *tree_search_prev_next(struct extent_io_tree *tree, + u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) { - return tree_search_for_insert(tree, offset, NULL, NULL); + struct rb_root *root = &tree->state; + struct rb_node **node = &root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + + ASSERT(prev_ret); + ASSERT(next_ret); + + while (*node) { + prev = *node; + entry = rb_entry(prev, struct tree_entry, rb_node); + + if (offset < entry->start) + node = &(*node)->rb_left; + else if (offset > entry->end) + node = &(*node)->rb_right; + else + return *node; + } + + orig_prev = prev; + while (prev && offset > entry->end) { + prev = rb_next(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + prev = orig_prev; + + entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < entry->start) { + prev = rb_prev(prev); + entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + + return NULL; } /* @@ -539,7 +540,7 @@ static void merge_state(struct extent_io_tree *tree, } static void set_state_bits(struct extent_io_tree *tree, - struct extent_state *state, u32 *bits, + struct extent_state *state, u32 bits, struct extent_changeset *changeset); /* @@ -553,37 +554,56 @@ static void set_state_bits(struct extent_io_tree *tree, * probably isn't what you want to call (see set/clear_extent_bit). */ static int insert_state(struct extent_io_tree *tree, - struct extent_state *state, u64 start, u64 end, - struct rb_node ***p, - struct rb_node **parent, - u32 *bits, struct extent_changeset *changeset) + struct extent_state *state, + u32 bits, struct extent_changeset *changeset) { - struct rb_node *node; - - if (end < start) { - btrfs_err(tree->fs_info, - "insert state: end < start %llu %llu", end, start); - WARN_ON(1); - } - state->start = start; - state->end = end; + struct rb_node **node; + struct rb_node *parent; + const u64 end = state->end; set_state_bits(tree, state, bits, changeset); - node = tree_insert(&tree->state, NULL, end, &state->rb_node, p, parent); - if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - found->start, found->end, start, end); - return -EEXIST; + node = &tree->state.rb_node; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (end < entry->start) { + node = &(*node)->rb_left; + } else if (end > entry->end) { + node = &(*node)->rb_right; + } else { + btrfs_err(tree->fs_info, + "found node %llu %llu on insert of %llu %llu", + entry->start, entry->end, state->start, end); + return -EEXIST; + } } + + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); return 0; } /* + * Insert state to @tree to the location given by @node and @parent. + */ +static void insert_state_fast(struct extent_io_tree *tree, + struct extent_state *state, struct rb_node **node, + struct rb_node *parent, unsigned bits, + struct extent_changeset *changeset) +{ + set_state_bits(tree, state, bits, changeset); + rb_link_node(&state->rb_node, parent, node); + rb_insert_color(&state->rb_node, &tree->state); + merge_state(tree, state); +} + +/* * split a given extent state struct in two, inserting the preallocated * struct 'prealloc' as the newly created second half. 
'split' indicates an * offset inside 'orig' where it should be split. @@ -600,7 +620,8 @@ static int insert_state(struct extent_io_tree *tree, static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct extent_state *prealloc, u64 split) { - struct rb_node *node; + struct rb_node *parent = NULL; + struct rb_node **node; if (tree->private_data && is_data_inode(tree->private_data)) btrfs_split_delalloc_extent(tree->private_data, orig, split); @@ -610,12 +631,27 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, prealloc->state = orig->state; orig->start = split; - node = tree_insert(&tree->state, &orig->rb_node, prealloc->end, - &prealloc->rb_node, NULL, NULL); - if (node) { - free_extent_state(prealloc); - return -EEXIST; + parent = &orig->rb_node; + node = &parent; + while (*node) { + struct tree_entry *entry; + + parent = *node; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (prealloc->end < entry->start) { + node = &(*node)->rb_left; + } else if (prealloc->end > entry->end) { + node = &(*node)->rb_right; + } else { + free_extent_state(prealloc); + return -EEXIST; + } } + + rb_link_node(&prealloc->rb_node, parent, node); + rb_insert_color(&prealloc->rb_node, &tree->state); + return 0; } @@ -637,11 +673,11 @@ static struct extent_state *next_state(struct extent_state *state) */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, int wake, + u32 bits, int wake, struct extent_changeset *changeset) { struct extent_state *next; - u32 bits_to_clear = *bits & ~EXTENT_CTLBITS; + u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; if ((bits_to_clear & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { @@ -803,8 +839,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - state = clear_state_bit(tree, state, &bits, wake, - changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); goto next; } goto search_again; @@ -825,13 +860,13 @@ hit_next: if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, &bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, &bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, changeset); next: if (last_end == (u64)-1) goto out; @@ -922,9 +957,9 @@ out: static void set_state_bits(struct extent_io_tree *tree, struct extent_state *state, - u32 *bits, struct extent_changeset *changeset) + u32 bits, struct extent_changeset *changeset) { - u32 bits_to_set = *bits & ~EXTENT_CTLBITS; + u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; if (tree->private_data && is_data_inode(tree->private_data)) @@ -1018,11 +1053,9 @@ again: if (!node) { prealloc = alloc_extent_state_atomic(prealloc); BUG_ON(!prealloc); - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, changeset); - if (err) - extent_io_tree_panic(tree, err); - + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, changeset); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1045,7 +1078,7 @@ hit_next: goto out; } - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1101,7 +1134,7 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, changeset); + set_state_bits(tree, state, bits, changeset); 
cache_state(state, cached_state); merge_state(tree, state); if (last_end == (u64)-1) @@ -1135,8 +1168,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, changeset); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, changeset); if (err) extent_io_tree_panic(tree, err); @@ -1164,7 +1198,7 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, changeset); + set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); merge_state(tree, prealloc); prealloc = NULL; @@ -1259,10 +1293,9 @@ again: err = -ENOMEM; goto out; } - err = insert_state(tree, prealloc, start, end, - &p, &parent, &bits, NULL); - if (err) - extent_io_tree_panic(tree, err); + prealloc->start = start; + prealloc->end = end; + insert_state_fast(tree, prealloc, p, parent, bits, NULL); cache_state(prealloc, cached_state); prealloc = NULL; goto out; @@ -1279,9 +1312,9 @@ hit_next: * Just lock what we found and keep going */ if (state->start == start && state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1320,10 +1353,9 @@ hit_next: if (err) goto out; if (state->end <= end) { - set_state_bits(tree, state, &bits, NULL); + set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, &clear_bits, 0, - NULL); + state = clear_state_bit(tree, state, clear_bits, 0, NULL); if (last_end == (u64)-1) goto out; start = last_end + 1; @@ -1357,8 +1389,9 @@ hit_next: * Avoid to free 'prealloc' if it can be merged with * the later extent. */ - err = insert_state(tree, prealloc, start, this_end, - NULL, NULL, &bits, NULL); + prealloc->start = start; + prealloc->end = this_end; + err = insert_state(tree, prealloc, bits, NULL); if (err) extent_io_tree_panic(tree, err); cache_state(prealloc, cached_state); @@ -1383,9 +1416,9 @@ hit_next: if (err) extent_io_tree_panic(tree, err); - set_state_bits(tree, prealloc, &bits, NULL); + set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, &clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, NULL); prealloc = NULL; goto out; } @@ -1659,7 +1692,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, /* Find first extent with bits cleared */ while (1) { - node = __etree_search(tree, start, &next, &prev, NULL, NULL); + node = tree_search_prev_next(tree, start, &prev, &next); if (!node && !next && !prev) { /* * Tree is completely empty, send full range and let @@ -1992,10 +2025,12 @@ noinline_for_stack bool find_lock_delalloc_range(struct inode *inode, struct page *locked_page, u64 *start, u64 *end) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; const u64 orig_start = *start; const u64 orig_end = *end; - u64 max_bytes = BTRFS_MAX_EXTENT_SIZE; + /* The sanity tests may not set a valid fs_info. */ + u64 max_bytes = fs_info ? 
fs_info->max_extent_size : BTRFS_MAX_EXTENT_SIZE; u64 delalloc_start; u64 delalloc_end; bool found; @@ -2303,12 +2338,13 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num) { - struct bio *bio; struct btrfs_device *dev; + struct bio_vec bvec; + struct bio bio; u64 map_length = 0; u64 sector; struct btrfs_io_context *bioc = NULL; - int ret; + int ret = 0; ASSERT(!(fs_info->sb->s_flags & SB_RDONLY)); BUG_ON(!mirror_num); @@ -2316,8 +2352,6 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, if (btrfs_repair_one_zone(fs_info, logical)) return 0; - bio = btrfs_bio_alloc(1); - bio->bi_iter.bi_size = 0; map_length = length; /* @@ -2335,52 +2369,50 @@ static int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, */ ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical, &map_length, &bioc, 0); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; ASSERT(bioc->mirror_num == 1); } else { ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, mirror_num); - if (ret) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; - } + if (ret) + goto out_counter_dec; BUG_ON(mirror_num != bioc->mirror_num); } sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9; - bio->bi_iter.bi_sector = sector; dev = bioc->stripes[bioc->mirror_num - 1].dev; btrfs_put_bioc(bioc); + if (!dev || !dev->bdev || !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) { - btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return -EIO; + ret = -EIO; + goto out_counter_dec; } - bio_set_dev(bio, dev->bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; - bio_add_page(bio, page, length, pg_offset); - if (btrfsic_submit_bio_wait(bio)) { + bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); + bio.bi_iter.bi_sector = sector; + __bio_add_page(&bio, page, length, pg_offset); + + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + if (ret) { /* try to remap that extent elsewhere? 
*/ - btrfs_bio_counter_dec(fs_info); - bio_put(bio); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS); - return -EIO; + goto out_bio_uninit; } btrfs_info_rl_in_rcu(fs_info, "read error corrected: ino %llu off %llu (dev %s sector %llu)", ino, start, rcu_str_deref(dev->name), sector); + ret = 0; + +out_bio_uninit: + bio_uninit(&bio); +out_counter_dec: btrfs_bio_counter_dec(fs_info); - bio_put(bio); - return 0; + return ret; } int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) @@ -2406,6 +2438,20 @@ int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) return ret; } +static int next_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == failrec->num_copies) + return cur_mirror + 1 - failrec->num_copies; + return cur_mirror + 1; +} + +static int prev_mirror(const struct io_failure_record *failrec, int cur_mirror) +{ + if (cur_mirror == 1) + return failrec->num_copies; + return cur_mirror - 1; +} + /* * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record @@ -2418,7 +2464,7 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, u64 private; struct io_failure_record *failrec; struct extent_state *state; - int num_copies; + int mirror; int ret; private = 0; @@ -2442,20 +2488,19 @@ int clean_io_failure(struct btrfs_fs_info *fs_info, EXTENT_LOCKED); spin_unlock(&io_tree->lock); - if (state && state->start <= failrec->start && - state->end >= failrec->start + failrec->len - 1) { - num_copies = btrfs_num_copies(fs_info, failrec->logical, - failrec->len); - if (num_copies > 1) { - repair_io_failure(fs_info, ino, start, failrec->len, - failrec->logical, page, pg_offset, - failrec->failed_mirror); - } - } + if (!state || state->start > failrec->start || + state->end < failrec->start + failrec->len - 1) + goto out; + + mirror = failrec->this_mirror; + do { + mirror = prev_mirror(failrec, mirror); + repair_io_failure(fs_info, ino, start, failrec->len, + failrec->logical, page, pg_offset, mirror); + } while (mirror != failrec->failed_mirror); out: free_io_failure(failure_tree, io_tree, failrec); - return 0; } @@ -2494,17 +2539,16 @@ void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end) } static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode, - u64 start) + struct btrfs_bio *bbio, + unsigned int bio_offset) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + u64 start = bbio->file_offset + bio_offset; struct io_failure_record *failrec; - struct extent_map *em; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; const u32 sectorsize = fs_info->sectorsize; int ret; - u64 logical; failrec = get_state_failrec(failure_tree, start); if (!IS_ERR(failrec)) { @@ -2516,7 +2560,8 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode * (e.g. with a list for failed_mirror) to make * clean_io_failure() clean all those errors at once. 
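/*
 * The mirror rotation added above (next_mirror()/prev_mirror()) treats
 * the copies 1..num_copies as a ring, so read repair can try every other
 * copy exactly once before giving up, and clean_io_failure() can walk
 * the same ring backwards to rewrite the repaired data.  Below is a
 * cut-down userspace harness; struct io_failure is an illustrative
 * stand-in for io_failure_record and the mirror numbers are made up.
 */
#include <stdio.h>

struct io_failure {
	int num_copies;
	int failed_mirror;
	int this_mirror;
};

static int next_mirror(const struct io_failure *failrec, int cur_mirror)
{
	if (cur_mirror == failrec->num_copies)
		return cur_mirror + 1 - failrec->num_copies;
	return cur_mirror + 1;
}

int main(void)
{
	struct io_failure rec = { .num_copies = 3, .failed_mirror = 1,
				  .this_mirror = 1 };
	int m;

	/* Retry order used by btrfs_repair_one_sector(): keep advancing
	 * until the ring wraps back around to the mirror that failed. */
	for (m = next_mirror(&rec, rec.this_mirror); m != rec.failed_mirror;
	     m = next_mirror(&rec, m))
		printf("retry from mirror %d\n", m);	/* 2, then 3 */

	printf("back at mirror %d, no copies left\n", m);
	return 0;
}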
*/ - + ASSERT(failrec->this_mirror == bbio->mirror_num); + ASSERT(failrec->len == fs_info->sectorsize); return failrec; } @@ -2526,42 +2571,28 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode failrec->start = start; failrec->len = sectorsize; - failrec->this_mirror = 0; - failrec->bio_flags = 0; + failrec->failed_mirror = bbio->mirror_num; + failrec->this_mirror = bbio->mirror_num; + failrec->logical = (bbio->iter.bi_sector << SECTOR_SHIFT) + bio_offset; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, failrec->len); - if (!em) { - read_unlock(&em_tree->lock); - kfree(failrec); - return ERR_PTR(-EIO); - } + btrfs_debug(fs_info, + "new io failure record logical %llu start %llu", + failrec->logical, start); - if (em->start > start || em->start + em->len <= start) { - free_extent_map(em); - em = NULL; - } - read_unlock(&em_tree->lock); - if (!em) { + failrec->num_copies = btrfs_num_copies(fs_info, failrec->logical, sectorsize); + if (failrec->num_copies == 1) { + /* + * We only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. No + * matter what the error is, it is very likely to persist. + */ + btrfs_debug(fs_info, + "cannot repair logical %llu num_copies %d", + failrec->logical, failrec->num_copies); kfree(failrec); return ERR_PTR(-EIO); } - logical = start - em->start; - logical = em->block_start + logical; - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - logical = em->block_start; - failrec->bio_flags = EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&failrec->bio_flags, em->compress_type); - } - - btrfs_debug(fs_info, - "Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu", - logical, start, failrec->len); - - failrec->logical = logical; - free_extent_map(em); - /* Set the bits in the private failure tree */ ret = set_extent_bits(failure_tree, start, start + sectorsize - 1, EXTENT_LOCKED | EXTENT_DIRTY); @@ -2578,65 +2609,16 @@ static struct io_failure_record *btrfs_get_io_failure_record(struct inode *inode return failrec; } -static bool btrfs_check_repairable(struct inode *inode, - struct io_failure_record *failrec, - int failed_mirror) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - int num_copies; - - num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); - if (num_copies == 1) { - /* - * we only have a single copy of the data, so don't bother with - * all the retry and error correction code that follows. no - * matter what the error is, it is very likely to persist. - */ - btrfs_debug(fs_info, - "Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - /* The failure record should only contain one sector */ - ASSERT(failrec->len == fs_info->sectorsize); - - /* - * There are two premises: - * a) deliver good data to the caller - * b) correct the bad sectors on disk - * - * Since we're only doing repair for one sector, we only need to get - * a good copy of the failed sector and if we succeed, we have setup - * everything for repair_io_failure to do the rest for us. 
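/*
 * Sketch of the offset arithmetic now used to build the failure record:
 * the file position of the bad sector is the bio's starting file offset
 * plus the offset of that sector within the bio, and its logical address
 * comes from the saved bi_sector (512-byte block layer sectors, i.e.
 * SECTOR_SHIFT == 9) plus the same in-bio offset.  The numbers below are
 * made up for illustration.
 */
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned long long file_offset = 1ULL << 20;	/* bio starts 1 MiB into the file    */
	unsigned long long bi_sector = 270336;		/* bio starts at logical 132 MiB     */
	unsigned int bio_offset = 8192;			/* third 4K sector of the bio failed */

	unsigned long long start = file_offset + bio_offset;
	unsigned long long logical = (bi_sector << SECTOR_SHIFT) + bio_offset;

	printf("start %llu logical %llu\n", start, logical);
	/* prints: start 1056768 logical 138420224 */
	return 0;
}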
- */ - ASSERT(failed_mirror); - failrec->failed_mirror = failed_mirror; - failrec->this_mirror++; - if (failrec->this_mirror == failed_mirror) - failrec->this_mirror++; - - if (failrec->this_mirror > num_copies) { - btrfs_debug(fs_info, - "Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d", - num_copies, failrec->this_mirror, failed_mirror); - return false; - } - - return true; -} - -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, +int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook) { + u64 start = failed_bbio->file_offset + bio_offset; struct io_failure_record *failrec; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; - struct btrfs_bio *failed_bbio = btrfs_bio(failed_bio); + struct bio *failed_bio = &failed_bbio->bio; const int icsum = bio_offset >> fs_info->sectorsize_bits; struct bio *repair_bio; struct btrfs_bio *repair_bbio; @@ -2646,12 +2628,24 @@ int btrfs_repair_one_sector(struct inode *inode, BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); - failrec = btrfs_get_io_failure_record(inode, start); + failrec = btrfs_get_io_failure_record(inode, failed_bbio, bio_offset); if (IS_ERR(failrec)) return PTR_ERR(failrec); - - if (!btrfs_check_repairable(inode, failrec, failed_mirror)) { + /* + * There are two premises: + * a) deliver good data to the caller + * b) correct the bad sectors on disk + * + * Since we're only doing repair for one sector, we only need to get + * a good copy of the failed sector and if we succeed, we have setup + * everything for repair_io_failure to do the rest for us. + */ + failrec->this_mirror = next_mirror(failrec, failrec->this_mirror); + if (failrec->this_mirror == failrec->failed_mirror) { + btrfs_debug(fs_info, + "failed to repair num_copies %d this_mirror %d failed_mirror %d", + failrec->num_copies, failrec->this_mirror, failrec->failed_mirror); free_io_failure(failure_tree, tree, failrec); return -EIO; } @@ -2684,7 +2678,7 @@ int btrfs_repair_one_sector(struct inode *inode, * will be handled by the endio on the repair_bio, so we can't return an * error here. 
*/ - submit_bio_hook(inode, repair_bio, failrec->this_mirror, failrec->bio_flags); + submit_bio_hook(inode, repair_bio, failrec->this_mirror, 0); return BLK_STS_OK; } @@ -2710,26 +2704,44 @@ static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) btrfs_page_set_error(fs_info, page, start, len); } - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) unlock_page(page); else btrfs_subpage_end_reader(fs_info, page, start, len); } -static blk_status_t submit_read_repair(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, u64 end, int failed_mirror, - unsigned int error_bitmap, - submit_bio_hook_t *submit_bio_hook) +static void end_sector_io(struct page *page, u64 offset, bool uptodate) { + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + const u32 sectorsize = inode->root->fs_info->sectorsize; + struct extent_state *cached = NULL; + + end_page_read(page, uptodate, offset, sectorsize); + if (uptodate) + set_extent_uptodate(&inode->io_tree, offset, + offset + sectorsize - 1, &cached, GFP_ATOMIC); + unlock_extent_cached_atomic(&inode->io_tree, offset, + offset + sectorsize - 1, &cached); +} + +static void submit_data_read_repair(struct inode *inode, + struct btrfs_bio *failed_bbio, + u32 bio_offset, const struct bio_vec *bvec, + unsigned int error_bitmap) +{ + const unsigned int pgoff = bvec->bv_offset; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct page *page = bvec->bv_page; + const u64 start = page_offset(bvec->bv_page) + bvec->bv_offset; + const u64 end = start + bvec->bv_len - 1; const u32 sectorsize = fs_info->sectorsize; const int nr_bits = (end + 1 - start) >> fs_info->sectorsize_bits; - int error = 0; int i; - BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); + BUG_ON(bio_op(&failed_bbio->bio) == REQ_OP_WRITE); + + /* This repair is only for data */ + ASSERT(is_data_inode(inode)); /* We're here because we had some read errors or csum mismatch */ ASSERT(error_bitmap); @@ -2738,12 +2750,11 @@ static blk_status_t submit_read_repair(struct inode *inode, * We only get called on buffered IO, thus page must be mapped and bio * must not be cloned. */ - ASSERT(page->mapping && !bio_flagged(failed_bio, BIO_CLONED)); + ASSERT(page->mapping && !bio_flagged(&failed_bbio->bio, BIO_CLONED)); /* Iterate through all the sectors in the range */ for (i = 0; i < nr_bits; i++) { const unsigned int offset = i * sectorsize; - struct extent_state *cached = NULL; bool uptodate = false; int ret; @@ -2756,10 +2767,9 @@ static blk_status_t submit_read_repair(struct inode *inode, goto next; } - ret = btrfs_repair_one_sector(inode, failed_bio, - bio_offset + offset, - page, pgoff + offset, start + offset, - failed_mirror, submit_bio_hook); + ret = btrfs_repair_one_sector(inode, failed_bbio, + bio_offset + offset, page, pgoff + offset, + btrfs_submit_data_read_bio); if (!ret) { /* * We have submitted the read repair, the page release @@ -2770,24 +2780,12 @@ static blk_status_t submit_read_repair(struct inode *inode, continue; } /* - * Repair failed, just record the error but still continue. - * Or the remaining sectors will not be properly unlocked. + * Continue on failed repair, otherwise the remaining sectors + * will not be properly unlocked. 
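/*
 * Sketch of the per-sector walk performed by submit_data_read_repair()
 * above: every bit of the error bitmap corresponds to one sectorsize
 * chunk of the bio_vec, set bits get a repair attempt and clear bits are
 * simply finished as up to date.  Sector size, range length and bitmap
 * value are illustrative.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;
	const unsigned int len = 16384;			/* one 16K bio_vec        */
	const unsigned int nr_bits = len / sectorsize;	/* four sectors           */
	const unsigned int error_bitmap = 0x5;		/* sectors 0 and 2 failed */
	unsigned int i;

	for (i = 0; i < nr_bits; i++) {
		unsigned int offset = i * sectorsize;

		if (error_bitmap & (1u << i))
			printf("repair sector at offset %u\n", offset);
		else
			printf("sector at offset %u is good\n", offset);
	}
	return 0;
}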
*/ - if (!error) - error = ret; next: - end_page_read(page, uptodate, start + offset, sectorsize); - if (uptodate) - set_extent_uptodate(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached, GFP_ATOMIC); - unlock_extent_cached_atomic(&BTRFS_I(inode)->io_tree, - start + offset, - start + offset + sectorsize - 1, - &cached); + end_sector_io(page, start + offset, uptodate); } - return errno_to_blk_status(error); } /* lots and lots of room for performance fixes in the end_bio funcs */ @@ -2943,7 +2941,7 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { ASSERT(PageLocked(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page)); @@ -2965,7 +2963,7 @@ static struct extent_buffer *find_extent_buffer_readpage( * For regular sectorsize, we can use page->private to grab extent * buffer */ - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { ASSERT(PagePrivate(page) && page->private); return (struct extent_buffer *)page->private; } @@ -3002,7 +3000,6 @@ static void end_bio_extent_readpage(struct bio *bio) */ u32 bio_offset = 0; int mirror; - int ret; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); @@ -3013,6 +3010,7 @@ static void end_bio_extent_readpage(struct bio *bio) struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; unsigned int error_bitmap = (unsigned int)-1; + bool repair = false; u64 start; u64 end; u32 len; @@ -3050,57 +3048,23 @@ static void end_bio_extent_readpage(struct bio *bio) if (is_data_inode(inode)) { error_bitmap = btrfs_verify_data_csum(bbio, bio_offset, page, start, end); - ret = error_bitmap; + if (error_bitmap) + uptodate = false; } else { - ret = btrfs_validate_metadata_buffer(bbio, - page, start, end, mirror); + if (btrfs_validate_metadata_buffer(bbio, + page, start, end, mirror)) + uptodate = false; } - if (ret) - uptodate = false; - else - clean_io_failure(BTRFS_I(inode)->root->fs_info, - failure_tree, tree, start, - page, - btrfs_ino(BTRFS_I(inode)), 0); } - if (likely(uptodate)) - goto readpage_ok; - - if (is_data_inode(inode)) { - /* - * If we failed to submit the IO at all we'll have a - * mirror_num == 0, in which case we need to just mark - * the page with an error and unlock it and carry on. - */ - if (mirror == 0) - goto readpage_ok; - - /* - * btrfs_submit_read_repair() will handle all the good - * and bad sectors, we just continue to the next bvec. - */ - submit_read_repair(inode, bio, bio_offset, page, - start - page_offset(page), start, - end, mirror, error_bitmap, - btrfs_submit_data_bio); - - ASSERT(bio_offset + len > bio_offset); - bio_offset += len; - continue; - } else { - struct extent_buffer *eb; - - eb = find_extent_buffer_readpage(fs_info, page, start); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - eb->read_mirror = mirror; - atomic_dec(&eb->io_pages); - } -readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_SHIFT; + clean_io_failure(BTRFS_I(inode)->root->fs_info, + failure_tree, tree, start, page, + btrfs_ino(BTRFS_I(inode)), 0); + /* * Zero out the remaining part if this range straddles * i_size. @@ -3117,14 +3081,44 @@ readpage_ok: zero_user_segment(page, zero_start, offset_in_page(end) + 1); } + } else if (is_data_inode(inode)) { + /* + * Only try to repair bios that actually made it to a + * device. 
If the bio failed to be submitted mirror + * is 0 and we need to fail it without retrying. + * + * This also includes the high level bios for compressed + * extents - these never make it to a device and repair + * is already handled on the lower compressed bio. + */ + if (mirror > 0) + repair = true; + } else { + struct extent_buffer *eb; + + eb = find_extent_buffer_readpage(fs_info, page, start); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); + eb->read_mirror = mirror; + atomic_dec(&eb->io_pages); } + + if (repair) { + /* + * submit_data_read_repair() will handle all the good + * and bad sectors, we just continue to the next bvec. + */ + submit_data_read_repair(inode, bbio, bio_offset, bvec, + error_bitmap); + } else { + /* Update page status and unlock */ + end_page_read(page, uptodate, start, len); + endio_readpage_release_extent(&processed, BTRFS_I(inode), + start, end, PageUptodate(page)); + } + ASSERT(bio_offset + len > bio_offset); bio_offset += len; - /* Update page status and unlock */ - end_page_read(page, uptodate, start, len); - endio_readpage_release_extent(&processed, BTRFS_I(inode), - start, end, PageUptodate(page)); } /* Release the last extent */ endio_readpage_release_extent(&processed, NULL, 0, 0, false); @@ -3132,6 +3126,42 @@ readpage_ok: bio_put(bio); } +/** + * Populate every free slot in a provided array with pages. + * + * @nr_pages: number of pages to allocate + * @page_array: the array to fill with pages; any existing non-null entries in + * the array will be skipped + * + * Return: 0 if all pages were able to be allocated; + * -ENOMEM otherwise, and the caller is responsible for freeing all + * non-null page pointers in the array. + */ +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +{ + unsigned int allocated; + + for (allocated = 0; allocated < nr_pages;) { + unsigned int last = allocated; + + allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + + if (allocated == nr_pages) + return 0; + + /* + * During this iteration, no page could be allocated, even + * though alloc_pages_bulk_array() falls back to alloc_page() + * if it could not bulk-allocate. So we must be out of memory. + */ + if (allocated == last) + return -ENOMEM; + + memalloc_retry_wait(GFP_NOFS); + } + return 0; +} + /* * Initialize the members up to but not including 'bio'. Use after allocating a * new bio by bio_alloc_bioset as it does not initialize the bytes outside of @@ -3157,19 +3187,6 @@ struct bio *btrfs_bio_alloc(unsigned int nr_iovecs) return bio; } -struct bio *btrfs_bio_clone(struct bio *bio) -{ - struct btrfs_bio *bbio; - struct bio *new; - - /* Bio allocation backed by a bioset does not fail */ - new = bio_alloc_clone(bio->bi_bdev, bio, GFP_NOFS, &btrfs_bioset); - bbio = btrfs_bio(new); - btrfs_bio_init(bbio); - bbio->iter = bio->bi_iter; - return new; -} - struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) { struct bio *bio; @@ -3198,7 +3215,7 @@ struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size) * a contiguous page to the previous one * @size: portion of page that we want to write * @pg_offset: starting offset in the page - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compression type of the current bio to see if we can merge them * * Attempt to add a page to bio considering stripe alignment etc. 
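/*
 * Userspace sketch of the retry pattern in btrfs_alloc_page_array()
 * above: call a bulk allocator that fills the free slots and returns the
 * total number now allocated, give up with -ENOMEM only when a full pass
 * makes no progress at all, otherwise back off and retry.  bulk_alloc()
 * is a made-up stand-in for alloc_pages_bulk_array() that caps itself at
 * two new buffers per call just to force the loop to iterate.
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static unsigned int bulk_alloc(unsigned int nr, void **array)
{
	unsigned int filled = 0, new_allocs = 0;

	for (unsigned int i = 0; i < nr; i++) {
		if (!array[i]) {
			if (new_allocs == 2)
				break;		/* pretend the pool ran dry */
			array[i] = malloc(4096);
			if (!array[i])
				break;
			new_allocs++;
		}
		filled++;
	}
	return filled;
}

static int alloc_buffer_array(unsigned int nr, void **array)
{
	unsigned int allocated;

	for (allocated = 0; allocated < nr;) {
		unsigned int last = allocated;

		allocated = bulk_alloc(nr, array);
		if (allocated == nr)
			return 0;
		if (allocated == last)
			return -ENOMEM;		/* no progress: give up */
		/* the kernel code would memalloc_retry_wait() here */
	}
	return 0;
}

int main(void)
{
	void *pages[5] = { NULL };
	int ret = alloc_buffer_array(5, pages);

	printf("ret %d\n", ret);		/* 0: filled in three passes */
	for (unsigned int i = 0; i < 5; i++)
		free(pages[i]);
	return ret;
}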
* @@ -3210,7 +3227,7 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int size, unsigned int pg_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct bio *bio = bio_ctrl->bio; u32 bio_size = bio->bi_iter.bi_size; @@ -3222,10 +3239,10 @@ static int btrfs_bio_add_page(struct btrfs_bio_ctrl *bio_ctrl, ASSERT(bio); /* The limit should be calculated when bio_ctrl->bio is allocated */ ASSERT(bio_ctrl->len_to_oe_boundary && bio_ctrl->len_to_stripe_boundary); - if (bio_ctrl->bio_flags != bio_flags) + if (bio_ctrl->compress_type != compress_type) return 0; - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) contig = bio->bi_iter.bi_sector == sector; else contig = bio_end_sector(bio) == sector; @@ -3268,7 +3285,7 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, * The split happens for real compressed bio, which happens in * btrfs_submit_compressed_read/write(). */ - if (bio_ctrl->bio_flags & EXTENT_BIO_COMPRESSED) { + if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { bio_ctrl->len_to_oe_boundary = U32_MAX; bio_ctrl->len_to_stripe_boundary = U32_MAX; return 0; @@ -3308,10 +3325,10 @@ static int calc_bio_boundaries(struct btrfs_bio_ctrl *bio_ctrl, static int alloc_new_bio(struct btrfs_inode *inode, struct btrfs_bio_ctrl *bio_ctrl, struct writeback_control *wbc, - unsigned int opf, + blk_opf_t opf, bio_end_io_t end_io_func, u64 disk_bytenr, u32 offset, u64 file_offset, - unsigned long bio_flags) + enum btrfs_compression_type compress_type) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio; @@ -3322,14 +3339,13 @@ static int alloc_new_bio(struct btrfs_inode *inode, * For compressed page range, its disk_bytenr is always @disk_bytenr * passed in, no matter if we have added any range into previous bio. 
*/ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) bio->bi_iter.bi_sector = disk_bytenr >> SECTOR_SHIFT; else bio->bi_iter.bi_sector = (disk_bytenr + offset) >> SECTOR_SHIFT; bio_ctrl->bio = bio; - bio_ctrl->bio_flags = bio_flags; + bio_ctrl->compress_type = compress_type; bio->bi_end_io = end_io_func; - bio->bi_private = &inode->io_tree; bio->bi_opf = opf; ret = calc_bio_boundaries(bio_ctrl, inode, file_offset); if (ret < 0) @@ -3386,16 +3402,15 @@ error: * @end_io_func: end_io callback for new bio * @mirror_num: desired mirror to read/write * @prev_bio_flags: flags of previous bio to see if we can merge the current one - * @bio_flags: flags of the current bio to see if we can merge them + * @compress_type: compress type for current bio */ -static int submit_extent_page(unsigned int opf, +static int submit_extent_page(blk_opf_t opf, struct writeback_control *wbc, struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, size_t size, unsigned long pg_offset, bio_end_io_t end_io_func, - int mirror_num, - unsigned long bio_flags, + enum btrfs_compression_type compress_type, bool force_bio_submit) { int ret = 0; @@ -3406,12 +3421,8 @@ static int submit_extent_page(unsigned int opf, ASSERT(pg_offset < PAGE_SIZE && size <= PAGE_SIZE && pg_offset + size <= PAGE_SIZE); - if (force_bio_submit && bio_ctrl->bio) { - ret = submit_one_bio(bio_ctrl->bio, mirror_num, bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; - } + if (force_bio_submit) + submit_one_bio(bio_ctrl); while (cur < pg_offset + size) { u32 offset = cur - pg_offset; @@ -3422,7 +3433,7 @@ static int submit_extent_page(unsigned int opf, ret = alloc_new_bio(inode, bio_ctrl, wbc, opf, end_io_func, disk_bytenr, offset, page_offset(page) + cur, - bio_flags); + compress_type); if (ret < 0) return ret; } @@ -3430,14 +3441,14 @@ static int submit_extent_page(unsigned int opf, * We must go through btrfs_bio_add_page() to ensure each * page range won't cross various boundaries. 
*/ - if (bio_flags & EXTENT_BIO_COMPRESSED) + if (compress_type != BTRFS_COMPRESS_NONE) added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr, size - offset, pg_offset + offset, - bio_flags); + compress_type); else added = btrfs_bio_add_page(bio_ctrl, page, disk_bytenr + offset, size - offset, - pg_offset + offset, bio_flags); + pg_offset + offset, compress_type); /* Metadata page range should never be split */ if (!is_data_inode(&inode->vfs_inode)) @@ -3451,11 +3462,7 @@ static int submit_extent_page(unsigned int opf, if (added < size - offset) { /* The bio should contain some page(s) */ ASSERT(bio_ctrl->bio->bi_iter.bi_size); - ret = submit_one_bio(bio_ctrl->bio, mirror_num, - bio_ctrl->bio_flags); - bio_ctrl->bio = NULL; - if (ret < 0) - return ret; + submit_one_bio(bio_ctrl); } cur += added; } @@ -3478,7 +3485,7 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, if (page->mapping) lockdep_assert_held(&page->mapping->private_lock); - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { if (!PagePrivate(page)) attach_page_private(page, eb); else @@ -3513,7 +3520,7 @@ int set_page_extent_mapped(struct page *page) fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); @@ -3530,7 +3537,7 @@ void clear_page_extent_mapped(struct page *page) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (fs_info->sectorsize < PAGE_SIZE) + if (btrfs_is_subpage(fs_info, page)) return btrfs_detach_subpage(fs_info, page); detach_page_private(page); @@ -3569,9 +3576,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, +static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start) + blk_opf_t read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -3603,7 +3610,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, if (zero_offset) { iosize = PAGE_SIZE - zero_offset; memzero_page(page, zero_offset, iosize); - flush_dcache_page(page); } } begin_page_read(fs_info, page); @@ -3618,7 +3624,6 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, iosize = PAGE_SIZE - pg_offset; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3638,16 +3643,13 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { - this_bio_flag |= EXTENT_BIO_COMPRESSED; - extent_set_compress_type(&this_bio_flag, - em->compress_type); - } + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = em->compress_type; iosize = min(extent_map_end(em) - cur, end - cur + 1); cur_end = min(extent_map_end(em) - 1, end); iosize = ALIGN(iosize, blocksize); - if (this_bio_flag & EXTENT_BIO_COMPRESSED) + if (this_bio_flag != BTRFS_COMPRESS_NONE) disk_bytenr = em->block_start; else disk_bytenr = em->block_start + extent_offset; @@ -3705,7 +3707,6 
@@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, struct extent_state *cached = NULL; memzero_page(page, pg_offset, iosize); - flush_dcache_page(page); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3738,13 +3739,15 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, bio_ctrl, page, disk_bytenr, iosize, - pg_offset, - end_bio_extent_readpage, 0, - this_bio_flag, - force_bio_submit); + pg_offset, end_bio_extent_readpage, + this_bio_flag, force_bio_submit); if (ret) { - unlock_extent(tree, cur, cur + iosize - 1); - end_page_read(page, false, cur, iosize); + /* + * We have to unlock the remaining range, or the page + * will never be unlocked. + */ + unlock_extent(tree, cur, end); + end_page_read(page, false, cur, end + 1 - cur); goto out; } cur = cur + iosize; @@ -3754,6 +3757,26 @@ out: return ret; } +int btrfs_read_folio(struct file *file, struct folio *folio) +{ + struct page *page = &folio->page; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; + struct btrfs_bio_ctrl bio_ctrl = { 0 }; + int ret; + + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); + /* + * If btrfs_do_readpage() failed we will want to submit the assembled + * bio to do the cleanup. + */ + submit_one_bio(&bio_ctrl); + return ret; +} + static inline void contiguous_readpages(struct page *pages[], int nr_pages, u64 start, u64 end, struct extent_map **em_cached, @@ -3772,12 +3795,6 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static void update_nr_written(struct writeback_control *wbc, - unsigned long nr_written) -{ - wbc->nr_to_write -= nr_written; -} - /* * helper for __extent_writepage, doing all of the delayed allocation setup. * @@ -3877,7 +3894,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. 
*/ - if (fs_info->sectorsize == PAGE_SIZE) { + if (!btrfs_is_subpage(fs_info, page)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -3920,10 +3937,12 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, u64 extent_offset; u64 block_start; struct extent_map *em; + int saved_ret = 0; int ret = 0; int nr = 0; - u32 opf = REQ_OP_WRITE; - const unsigned int write_flags = wbc_to_write_flags(wbc); + enum req_op op = REQ_OP_WRITE; + const blk_opf_t write_flags = wbc_to_write_flags(wbc); + bool has_error = false; bool compressed; ret = btrfs_writepage_cow_fixup(page); @@ -3938,7 +3957,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * we don't want to touch the inode after unlocking the page, * so we update the mapping writeback index now */ - update_nr_written(wbc, 1); + wbc->nr_to_write--; while (cur <= end) { u64 disk_bytenr; @@ -3973,6 +3992,9 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, if (IS_ERR(em)) { btrfs_page_set_error(fs_info, page, cur, end - cur + 1); ret = PTR_ERR_OR_ZERO(em); + has_error = true; + if (!saved_ret) + saved_ret = ret; break; } @@ -3993,7 +4015,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, iosize = min(min(em_end, end + 1), dirty_range_end) - cur; if (btrfs_use_zone_append(inode, em->block_start)) - opf = REQ_OP_ZONE_APPEND; + op = REQ_OP_ZONE_APPEND; free_extent_map(em); em = NULL; @@ -4029,13 +4051,17 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, */ btrfs_page_clear_dirty(fs_info, page, cur, iosize); - ret = submit_extent_page(opf | write_flags, wbc, + ret = submit_extent_page(op | write_flags, wbc, &epd->bio_ctrl, page, disk_bytenr, iosize, cur - page_offset(page), end_bio_extent_writepage, - 0, 0, false); + 0, false); if (ret) { + has_error = true; + if (!saved_ret) + saved_ret = ret; + btrfs_page_set_error(fs_info, page, cur, iosize); if (PageWriteback(page)) btrfs_page_clear_writeback(fs_info, page, cur, @@ -4049,8 +4075,10 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * If we finish without problem, we should not only clear page dirty, * but also empty subpage dirty bits */ - if (!ret) + if (!has_error) btrfs_page_assert_not_dirty(fs_info, page); + else + ret = saved_ret; *nr_ret = nr; return ret; } @@ -4093,10 +4121,8 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, return 0; } - if (page->index == end_index) { + if (page->index == end_index) memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); - flush_dcache_page(page); - } ret = set_page_extent_mapped(page); if (ret < 0) { @@ -4181,9 +4207,6 @@ void wait_on_extent_buffer_writeback(struct extent_buffer *eb) static void end_extent_buffer_writeback(struct extent_buffer *eb) { - if (test_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags)) - btrfs_zone_finish_endio(eb->fs_info, eb->start, eb->len); - clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -4203,14 +4226,12 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb struct extent_page_data *epd) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i, num_pages, failed_page_nr; + int i, num_pages; int flush = 0; int ret = 0; if (!btrfs_try_tree_write_lock(eb)) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; btrfs_tree_lock(eb); } @@ -4220,9 
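/*
 * Sketch of the "remember the first error but keep going" pattern that
 * __extent_writepage_io() now follows with saved_ret/has_error: later
 * failures must not clobber the first error code, and the remaining
 * ranges are still walked so their state gets cleaned up.
 * process_range() is a made-up stand-in that fails on two of the ranges.
 */
#include <stdio.h>
#include <errno.h>

static int process_range(int i)
{
	if (i == 1)
		return -EIO;
	if (i == 3)
		return -ENOMEM;
	return 0;
}

int main(void)
{
	int saved_ret = 0;
	int has_error = 0;
	int i;

	for (i = 0; i < 5; i++) {
		int ret = process_range(i);

		if (ret) {
			has_error = 1;
			if (!saved_ret)
				saved_ret = ret;	/* first error wins */
		}
		/* keep going: the range still has to be finished/unlocked */
	}
	printf("%d\n", has_error ? saved_ret : 0);	/* prints -5 (-EIO) */
	return 0;
}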
+4241,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!epd->sync_io) return 0; if (!flush) { - ret = flush_write_bio(epd); - if (ret < 0) - return ret; + submit_write_bio(epd, 0); flush = 1; } while (1) { @@ -4260,7 +4279,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb * Subpage metadata doesn't use page locking at all, so we can skip * the page locking. */ - if (!ret || fs_info->sectorsize < PAGE_SIZE) + if (!ret || fs_info->nodesize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4269,14 +4288,7 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb if (!trylock_page(p)) { if (!flush) { - int err; - - err = flush_write_bio(epd); - if (err < 0) { - ret = err; - failed_page_nr = i; - goto err_unlock; - } + submit_write_bio(epd, 0); flush = 1; } lock_page(p); @@ -4284,25 +4296,6 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb } return ret; -err_unlock: - /* Unlock already locked pages */ - for (i = 0; i < failed_page_nr; i++) - unlock_page(eb->pages[i]); - /* - * Clear EXTENT_BUFFER_WRITEBACK and wake up anyone waiting on it. - * Also set back EXTENT_BUFFER_DIRTY so future attempts to this eb can - * be made and undo everything done before. - */ - btrfs_tree_lock(eb); - spin_lock(&eb->refs_lock); - set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - end_extent_buffer_writeback(eb); - spin_unlock(&eb->refs_lock); - percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, - fs_info->dirty_metadata_batch); - btrfs_clear_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); - btrfs_tree_unlock(eb); - return ret; } static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) @@ -4420,7 +4413,7 @@ static void end_bio_subpage_eb_writepage(struct bio *bio) struct bvec_iter_all iter_all; fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->nodesize < PAGE_SIZE); ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { @@ -4537,7 +4530,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, { struct btrfs_fs_info *fs_info = eb->fs_info; struct page *page = eb->pages[0]; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); bool no_dirty_ebs = false; int ret; @@ -4556,7 +4549,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, &epd->bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_subpage_eb_writepage, 0, 0, false); + end_bio_subpage_eb_writepage, 0, false); if (ret) { btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); set_btree_ioerr(page, eb); @@ -4572,7 +4565,7 @@ static int write_one_subpage_eb(struct extent_buffer *eb, * dirty anymore, we have submitted a page. Update nr_written in wbc. 
*/ if (no_dirty_ebs) - update_nr_written(wbc, 1); + wbc->nr_to_write--; return ret; } @@ -4582,7 +4575,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, { u64 disk_bytenr = eb->start; int i, num_pages; - unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + blk_opf_t write_flags = wbc_to_write_flags(wbc); int ret = 0; prepare_eb_write(eb); @@ -4597,7 +4590,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, &epd->bio_ctrl, p, disk_bytenr, PAGE_SIZE, 0, end_bio_extent_buffer_writepage, - 0, 0, false); + 0, false); if (ret) { set_btree_ioerr(p, eb); if (PageWriteback(p)) @@ -4608,7 +4601,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, break; } disk_bytenr += PAGE_SIZE; - update_nr_written(wbc, 1); + wbc->nr_to_write--; unlock_page(p); } @@ -4711,7 +4704,7 @@ static int submit_eb_subpage(struct page *page, cleanup: /* We hit error, end bio for the submitted extent buffers */ - end_write_bio(epd, ret); + submit_write_bio(epd, ret); return ret; } @@ -4747,7 +4740,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc, epd); spin_lock(&mapping->private_lock); @@ -4803,8 +4796,7 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, /* * Implies write in zoned mode. Mark the last eb in a block group. */ - if (cache->seq_zone && eb->start + eb->len == cache->zone_capacity) - set_bit(EXTENT_BUFFER_ZONE_FINISH, &eb->bflags); + btrfs_schedule_zone_finish_bg(cache, eb); btrfs_put_block_group(cache); } ret = write_one_eb(eb, wbc, epd); @@ -4891,10 +4883,6 @@ retry: index = 0; goto retry; } - if (ret < 0) { - end_write_bio(&epd, ret); - goto out; - } /* * If something went wrong, don't allow any metadata write bio to be * submitted. @@ -4921,14 +4909,16 @@ retry: * Now such dirty tree block will not be cleaned by any dirty * extent io tree. Thus we don't want to submit such wild eb * if the fs already has error. + * + * We can get ret > 0 from submit_extent_page() indicating how many ebs + * were submitted. Reset it to 0 to avoid false alerts for the caller. */ - if (!BTRFS_FS_ERROR(fs_info)) { - ret = flush_write_bio(&epd); - } else { + if (ret > 0) + ret = 0; + if (!ret && BTRFS_FS_ERROR(fs_info)) ret = -EROFS; - end_write_bio(&epd, ret); - } -out: + submit_write_bio(&epd, ret); + btrfs_zoned_meta_io_unlock(fs_info); return ret; } @@ -5031,8 +5021,7 @@ retry: * tmpfs file mapping */ if (!trylock_page(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); + submit_write_bio(epd, 0); lock_page(page); } @@ -5042,10 +5031,8 @@ retry: } if (wbc->sync_mode != WB_SYNC_NONE) { - if (PageWriteback(page)) { - ret = flush_write_bio(epd); - BUG_ON(ret < 0); - } + if (PageWriteback(page)) + submit_write_bio(epd, 0); wait_on_page_writeback(page); } @@ -5085,9 +5072,8 @@ retry: * page in our current bio, and thus deadlock, so flush the * write bio here. 
*/ - ret = flush_write_bio(epd); - if (!ret) - goto retry; + submit_write_bio(epd, 0); + goto retry; } if (wbc->range_cyclic || (wbc->nr_to_write > 0 && range_whole)) @@ -5097,27 +5083,6 @@ retry: return ret; } -int extent_write_full_page(struct page *page, struct writeback_control *wbc) -{ - int ret; - struct extent_page_data epd = { - .bio_ctrl = { 0 }, - .extent_locked = 0, - .sync_io = wbc->sync_mode == WB_SYNC_ALL, - }; - - ret = __extent_writepage(page, wbc, &epd); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - - ret = flush_write_bio(&epd); - ASSERT(ret <= 0); - return ret; -} - /* * Submit the pages in the range to bio for call sites which delalloc range has * already been ran (aka, ordered extent inserted) and all pages are still @@ -5175,10 +5140,7 @@ int extent_write_locked_range(struct inode *inode, u64 start, u64 end) cur = cur_end + 1; } - if (!found_error) - ret = flush_write_bio(&epd); - else - end_write_bio(&epd, ret); + submit_write_bio(&epd, found_error ? ret : 0); wbc_detach_inode(&wbc_writepages); if (found_error) @@ -5203,13 +5165,8 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); + submit_write_bio(&epd, ret); btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); - ASSERT(ret <= 0); - if (ret < 0) { - end_write_bio(&epd, ret); - return ret; - } - ret = flush_write_bio(&epd); return ret; } @@ -5231,11 +5188,7 @@ void extent_readahead(struct readahead_control *rac) if (em_cached) free_extent_map(em_cached); - - if (bio_ctrl.bio) { - if (submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags)) - return; - } + submit_one_bio(&bio_ctrl); } /* @@ -5271,7 +5224,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, } /* - * a helper for releasepage, this tests for areas of the page that + * a helper for release_folio, this tests for areas of the page that * are locked or under IO and drops the related state bits if it is safe * to drop the page. */ @@ -5307,7 +5260,7 @@ static int try_release_extent_state(struct extent_io_tree *tree, } /* - * a helper for releasepage. As long as there are no locked extents + * a helper for release_folio. 
As long as there are no locked extents * in the range corresponding to the page, both state records and extent * map records are removed */ @@ -5804,7 +5757,7 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag return; } - if (fs_info->sectorsize == PAGE_SIZE) { + if (fs_info->nodesize >= PAGE_SIZE) { /* * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race @@ -5911,9 +5864,9 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { int i; - struct page *p; struct extent_buffer *new; int num_pages = num_extent_pages(src); + int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); if (new == NULL) @@ -5926,22 +5879,23 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); + memset(new->pages, 0, sizeof(*new->pages) * num_pages); + ret = btrfs_alloc_page_array(num_pages, new->pages); + if (ret) { + btrfs_release_extent_buffer(new); + return NULL; + } + for (i = 0; i < num_pages; i++) { int ret; + struct page *p = new->pages[i]; - p = alloc_page(GFP_NOFS); - if (!p) { - btrfs_release_extent_buffer(new); - return NULL; - } ret = attach_extent_buffer_page(new, p, NULL); if (ret < 0) { - put_page(p); btrfs_release_extent_buffer(new); return NULL; } WARN_ON(PageDirty(p)); - new->pages[i] = p; copy_page(page_address(p), page_address(src->pages[i])); } set_extent_buffer_uptodate(new); @@ -5955,31 +5909,36 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, struct extent_buffer *eb; int num_pages; int i; + int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; num_pages = num_extent_pages(eb); + ret = btrfs_alloc_page_array(num_pages, eb->pages); + if (ret) + goto err; + for (i = 0; i < num_pages; i++) { - int ret; + struct page *p = eb->pages[i]; - eb->pages[i] = alloc_page(GFP_NOFS); - if (!eb->pages[i]) - goto err; - ret = attach_extent_buffer_page(eb, eb->pages[i], NULL); + ret = attach_extent_buffer_page(eb, p, NULL); if (ret < 0) goto err; } + set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; err: - for (; i > 0; i--) { - detach_extent_buffer_page(eb, eb->pages[i - 1]); - __free_page(eb->pages[i - 1]); + for (i = 0; i < num_pages; i++) { + if (eb->pages[i]) { + detach_extent_buffer_page(eb, eb->pages[i]); + __free_page(eb->pages[i]); + } } __free_extent_buffer(eb); return NULL; @@ -6001,10 +5960,10 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or - * calling releasepage when the tree reference is the only reference. + * calling release_folio when the tree reference is the only reference. * * In both cases, care is taken to ensure that the extent_buffer's - * pages are not under io. However, releasepage can be concurrently + * pages are not under io. However, release_folio can be concurrently * called with creating new references, which is prone to race * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. @@ -6124,7 +6083,7 @@ static struct extent_buffer *grab_extent_buffer( * don't try to insert two ebs for the same bytenr. So here we always * return NULL and just continue. 
*/ - if (fs_info->sectorsize < PAGE_SIZE) + if (fs_info->nodesize < PAGE_SIZE) return NULL; /* Page not yet attached to an extent buffer */ @@ -6146,6 +6105,30 @@ static struct extent_buffer *grab_extent_buffer( return NULL; } +static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) +{ + if (!IS_ALIGNED(start, fs_info->sectorsize)) { + btrfs_err(fs_info, "bad tree block start %llu", start); + return -EINVAL; + } + + if (fs_info->nodesize < PAGE_SIZE && + offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + btrfs_err(fs_info, + "tree block crosses page boundary, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + if (fs_info->nodesize >= PAGE_SIZE && + !PAGE_ALIGNED(start)) { + btrfs_err(fs_info, + "tree block is not page aligned, start %llu nodesize %u", + start, fs_info->nodesize); + return -EINVAL; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { @@ -6160,10 +6143,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, int uptodate = 1; int ret; - if (!IS_ALIGNED(start, fs_info->sectorsize)) { - btrfs_err(fs_info, "bad tree block start %llu", start); + if (check_eb_alignment(fs_info, start)) return ERR_PTR(-EINVAL); - } #if BITS_PER_LONG == 32 if (start >= MAX_LFS_FILESIZE) { @@ -6176,14 +6157,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_warn_32bit_limit(fs_info); #endif - if (fs_info->sectorsize < PAGE_SIZE && - offset_in_page(start) + len > PAGE_SIZE) { - btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %lu", - start, len); - return ERR_PTR(-EINVAL); - } - eb = find_extent_buffer(fs_info, start); if (eb) return eb; @@ -6213,7 +6186,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * page, but it may change in the future for 16K page size * support, so we still preallocate the memory in the loop. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); @@ -6257,7 +6230,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, /* * We can't unlock the pages just yet since the extent buffer * hasn't been properly inserted in the radix tree, this - * opens a race with btree_releasepage which can free a page + * opens a race with btree_release_folio which can free a page * while we are still filling in all pages for the buffer and * we could crash. */ @@ -6289,7 +6262,7 @@ again: /* * Now it's safe to unlock the pages because any calls to - * btree_releasepage will correctly detect that a page belongs to a + * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. 
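/*
 * Sketch of the tree block placement rules that check_eb_alignment()
 * above enforces: the start must be sectorsize aligned, a node smaller
 * than a page may not straddle a page boundary, and a node of at least a
 * page must itself start on a page boundary.  The PAGE_SIZE, sectorsize
 * and nodesize values below are illustrative.
 */
#include <stdio.h>
#include <stdbool.h>

#define PAGE_SIZE 65536ULL		/* e.g. a 64K-page arm64/ppc64 box */

static bool is_aligned(unsigned long long x, unsigned long long a)
{
	return (x & (a - 1)) == 0;	/* a must be a power of two */
}

static unsigned long long offset_in_page(unsigned long long x)
{
	return x & (PAGE_SIZE - 1);
}

static bool eb_placement_ok(unsigned long long start,
			    unsigned int sectorsize, unsigned int nodesize)
{
	if (!is_aligned(start, sectorsize))
		return false;				/* bad tree block start    */
	if (nodesize < PAGE_SIZE &&
	    offset_in_page(start) + nodesize > PAGE_SIZE)
		return false;				/* crosses a page boundary */
	if (nodesize >= PAGE_SIZE && !is_aligned(start, PAGE_SIZE))
		return false;				/* not page aligned        */
	return true;
}

int main(void)
{
	/* 16K nodes on a 64K page system: fine at 48K, bad at 56K. */
	printf("%d %d\n", eb_placement_ok(49152, 4096, 16384),
	       eb_placement_ok(57344, 4096, 16384));	/* prints: 1 0 */
	return 0;
}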
*/ for (i = 0; i < num_pages; i++) @@ -6432,7 +6405,7 @@ void clear_extent_buffer_dirty(const struct extent_buffer *eb) int num_pages; struct page *page; - if (eb->fs_info->sectorsize < PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); num_pages = num_extent_pages(eb); @@ -6464,7 +6437,7 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + bool subpage = eb->fs_info->nodesize < PAGE_SIZE; /* * For subpage case, we can have other extent buffers in the @@ -6504,9 +6477,18 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - if (page) - btrfs_page_clear_uptodate(fs_info, page, - eb->start, eb->len); + if (!page) + continue; + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + ClearPageUptodate(page); + else + btrfs_subpage_clear_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6521,7 +6503,16 @@ void set_extent_buffer_uptodate(struct extent_buffer *eb) num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; - btrfs_page_set_uptodate(fs_info, page, eb->start, eb->len); + + /* + * This is special handling for metadata subpage, as regular + * btrfs_is_subpage() can not handle cloned/dummy metadata. + */ + if (fs_info->nodesize >= PAGE_SIZE) + SetPageUptodate(page); + else + btrfs_subpage_set_uptodate(fs_info, page, eb->start, + eb->len); } } @@ -6531,7 +6522,9 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, struct btrfs_fs_info *fs_info = eb->fs_info; struct extent_io_tree *io_tree; struct page *page = eb->pages[0]; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; int ret = 0; ASSERT(!test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags)); @@ -6563,11 +6556,10 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, btrfs_subpage_clear_error(fs_info, page, eb->start, eb->len); btrfs_subpage_start_reader(fs_info, page, eb->start, eb->len); - ret = submit_extent_page(REQ_OP_READ | REQ_META, NULL, &bio_ctrl, + ret = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, eb->start, eb->len, eb->start - page_offset(page), - end_bio_extent_readpage, mirror_num, 0, - true); + end_bio_extent_readpage, 0, true); if (ret) { /* * In the endio function, if we hit something wrong we will @@ -6576,14 +6568,7 @@ static int read_extent_buffer_subpage(struct extent_buffer *eb, int wait, */ atomic_dec(&eb->io_pages); } - if (bio_ctrl.bio) { - int tmp; - - tmp = submit_one_bio(bio_ctrl.bio, mirror_num, 0); - bio_ctrl.bio = NULL; - if (tmp < 0) - return tmp; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6603,7 +6588,9 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) int all_uptodate = 1; int num_pages; unsigned long num_reads = 0; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; + struct btrfs_bio_ctrl bio_ctrl = { + .mirror_num = mirror_num, + }; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -6616,7 +6603,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) if (unlikely(test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))) return -EIO; - if (eb->fs_info->sectorsize 
< PAGE_SIZE) + if (eb->fs_info->nodesize < PAGE_SIZE) return read_extent_buffer_subpage(eb, wait, mirror_num); num_pages = num_extent_pages(eb); @@ -6659,7 +6646,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); /* - * It is possible for releasepage to clear the TREE_REF bit before we + * It is possible for release_folio to clear the TREE_REF bit before we * set io_pages. See check_buffer_tree_ref for a more detailed comment. */ check_buffer_tree_ref(eb); @@ -6674,10 +6661,10 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + err = submit_extent_page(REQ_OP_READ, NULL, &bio_ctrl, page, page_offset(page), PAGE_SIZE, 0, end_bio_extent_readpage, - mirror_num, 0, false); + 0, false); if (err) { /* * We failed to submit the bio so it's the @@ -6694,12 +6681,7 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } } - if (bio_ctrl.bio) { - err = submit_one_bio(bio_ctrl.bio, mirror_num, bio_ctrl.bio_flags); - bio_ctrl.bio = NULL; - if (err) - return err; - } + submit_one_bio(&bio_ctrl); if (ret || wait != WAIT_COMPLETE) return ret; @@ -6871,7 +6853,7 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, * would have !PageUptodate && !PageError, as we clear PageError before * reading. */ - if (fs_info->sectorsize < PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE) { bool uptodate, error; uptodate = btrfs_subpage_test_uptodate(fs_info, page, @@ -6973,7 +6955,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, ASSERT(dst->len == src->len); - if (dst->fs_info->sectorsize == PAGE_SIZE) { + if (dst->fs_info->nodesize >= PAGE_SIZE) { num_pages = num_extent_pages(dst); for (i = 0; i < num_pages; i++) copy_page(page_address(dst->pages[i]), @@ -6982,7 +6964,7 @@ void copy_extent_buffer_full(const struct extent_buffer *dst, size_t src_offset = get_eb_offset_in_page(src, 0); size_t dst_offset = get_eb_offset_in_page(dst, 0); - ASSERT(src->fs_info->sectorsize < PAGE_SIZE); + ASSERT(src->fs_info->nodesize < PAGE_SIZE); memcpy(page_address(dst->pages[0]) + dst_offset, page_address(src->pages[0]) + src_offset, src->len); @@ -7375,7 +7357,7 @@ int try_release_extent_buffer(struct page *page) { struct extent_buffer *eb; - if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 151e9da5da2d..4bc72a87b9a9 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -7,15 +7,9 @@ #include <linux/refcount.h> #include <linux/fiemap.h> #include <linux/btrfs_tree.h> +#include "compression.h" #include "ulist.h" -/* - * flags for bio submission. 
The high bits indicate the compression - * type for this bio - */ -#define EXTENT_BIO_COMPRESSED 1 -#define EXTENT_BIO_FLAG_SHIFT 16 - enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, @@ -32,7 +26,6 @@ enum { /* write IO error */ EXTENT_BUFFER_WRITE_ERR, EXTENT_BUFFER_NO_CHECK, - EXTENT_BUFFER_ZONE_FINISH, }; /* these are flags for __process_pages_contig */ @@ -64,6 +57,7 @@ enum { #define BITMAP_LAST_BYTE_MASK(nbits) \ (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) +struct btrfs_bio; struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; @@ -71,9 +65,9 @@ struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; -typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, +typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio, int mirror_num, - unsigned long bio_flags); + enum btrfs_compression_type compress_type); typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode, struct bio *bio, u64 dio_file_offset); @@ -103,17 +97,6 @@ struct extent_buffer { }; /* - * Structure to record info about the bio being assembled, and other info like - * how many bytes are there before stripe/ordered extent boundary. - */ -struct btrfs_bio_ctrl { - struct bio *bio; - unsigned long bio_flags; - u32 len_to_stripe_boundary; - u32 len_to_oe_boundary; -}; - -/* * Structure to record how many bytes and which ranges are set/cleared */ struct extent_changeset { @@ -158,32 +141,12 @@ static inline void extent_changeset_free(struct extent_changeset *changeset) kfree(changeset); } -static inline void extent_set_compress_type(unsigned long *bio_flags, - int compress_type) -{ - *bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT; -} - -static inline int extent_compress_type(unsigned long bio_flags) -{ - return bio_flags >> EXTENT_BIO_FLAG_SHIFT; -} - struct extent_map_tree; -typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); - int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags); -int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, - struct btrfs_bio_ctrl *bio_ctrl, - unsigned int read_flags, u64 *prev_em_start); -int extent_write_full_page(struct page *page, struct writeback_control *wbc); +int btrfs_read_folio(struct file *file, struct folio *folio); int extent_write_locked_range(struct inode *inode, u64 start, u64 end); int extent_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -277,8 +240,9 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct page *locked_page, u32 bits_to_clear, unsigned long page_ops); + +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); struct bio *btrfs_bio_alloc(unsigned int nr_iovecs); -struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size); void end_extent_writepage(struct page *page, int err, u64 start, u64 end); @@ -297,15 +261,13 @@ struct io_failure_record { u64 start; u64 len; u64 logical; - unsigned long bio_flags; int this_mirror; int failed_mirror; + int num_copies; }; -int btrfs_repair_one_sector(struct inode *inode, - struct bio *failed_bio, u32 bio_offset, - struct page *page, unsigned int pgoff, - u64 start, int failed_mirror, 
+int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio, + u32 bio_offset, struct page *page, unsigned int pgoff, submit_bio_hook_t *submit_bio_hook); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 380054c94e4b..66c822182ecc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1307,11 +1307,12 @@ static int prepare_uptodate_page(struct inode *inode, struct page *page, u64 pos, bool force_uptodate) { + struct folio *folio = page_folio(page); int ret = 0; if (((pos & (PAGE_SIZE - 1)) || force_uptodate) && !PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, folio); if (ret) return ret; lock_page(page); @@ -1321,8 +1322,8 @@ static int prepare_uptodate_page(struct inode *inode, } /* - * Since btrfs_readpage() will unlock the page before it - * returns, there is a window where btrfs_releasepage() can be + * Since btrfs_read_folio() will unlock the folio before it + * returns, there is a window where btrfs_release_folio() can be * called to release the page. Here we check both inode * mapping and PagePrivate() to make sure the page was not * released. @@ -1460,8 +1461,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, return ret; } -static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes, bool nowait) +/* + * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) + * + * @pos: File offset. + * @write_bytes: The length to write, will be updated to the nocow writeable + * range. + * + * This function will flush ordered extents in the range to ensure proper + * nocow checks. + * + * Return: + * > 0 If we can nocow, and updates @write_bytes. + * 0 If we can't do a nocow write. + * -EAGAIN If we can't do a nocow write because snapshoting of the inode's + * root is in progress. + * < 0 If an error happened. + * + * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0. 
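A minimal caller sketch of the contract documented above, using the btrfs_check_nocow_lock() signature declared just below; the wrapper name is made up and the actual NOCOW write-out is elided:

static ssize_t example_nocow_write_range(struct btrfs_inode *inode, loff_t pos,
                                         size_t count)
{
        size_t write_bytes = count;
        int ret;

        ret = btrfs_check_nocow_lock(inode, pos, &write_bytes);
        if (ret <= 0)
                return ret;     /* 0: must fall back to COW; <0: -EAGAIN or error */

        /* ... NOCOW write of at most write_bytes bytes goes here ... */

        btrfs_check_nocow_unlock(inode);
        return write_bytes;
}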
+ */ +int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, + size_t *write_bytes) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_root *root = inode->root; @@ -1472,7 +1492,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) return 0; - if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock)) + if (!btrfs_drew_try_write_lock(&root->snapshot_lock)) return -EAGAIN; lockstart = round_down(pos, fs_info->sectorsize); @@ -1480,71 +1500,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos, fs_info->sectorsize) - 1; num_bytes = lockend - lockstart + 1; - if (nowait) { - struct btrfs_ordered_extent *ordered; - - if (!try_lock_extent(&inode->io_tree, lockstart, lockend)) - return -EAGAIN; - - ordered = btrfs_lookup_ordered_range(inode, lockstart, - num_bytes); - if (ordered) { - btrfs_put_ordered_extent(ordered); - ret = -EAGAIN; - goto out_unlock; - } - } else { - btrfs_lock_and_flush_ordered_range(inode, lockstart, - lockend, NULL); - } - + btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL); ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, NULL, NULL, NULL, false); if (ret <= 0) { ret = 0; - if (!nowait) - btrfs_drew_write_unlock(&root->snapshot_lock); + btrfs_drew_write_unlock(&root->snapshot_lock); } else { *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); } -out_unlock: unlock_extent(&inode->io_tree, lockstart, lockend); return ret; } -static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, true); -} - -/* - * Check if we can do nocow write into the range [@pos, @pos + @write_bytes) - * - * @pos: File offset - * @write_bytes: The length to write, will be updated to the nocow writeable - * range - * - * This function will flush ordered extents in the range to ensure proper - * nocow checks. - * - * Return: - * >0 and update @write_bytes if we can do nocow write - * 0 if we can't do nocow write - * -EAGAIN if we can't get the needed lock or there are ordered extents - * for * (nowait == true) case - * <0 if other error happened - * - * NOTE: Callers need to release the lock by btrfs_check_nocow_unlock(). - */ -int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, - size_t *write_bytes) -{ - return check_can_nocow(inode, pos, write_bytes, false); -} - void btrfs_check_nocow_unlock(struct btrfs_inode *inode) { btrfs_drew_write_unlock(&inode->root->snapshot_lock); @@ -1579,20 +1549,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from, loff_t oldsize; loff_t start_pos; - if (iocb->ki_flags & IOCB_NOWAIT) { - size_t nocow_bytes = count; - - /* We will allocate space in case nodatacow is not set, so bail */ - if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0) - return -EAGAIN; - /* - * There are holes in the range or parts of the range that must - * be COWed (shared extents, RO block groups, etc), so just bail - * out. - */ - if (nocow_bytes < count) - return -EAGAIN; - } + /* + * Quickly bail out on NOWAIT writes if we don't have the nodatacow or + * prealloc flags, as without those flags we always have to COW. We will + * later check if we can really COW into the target range (using + * can_nocow_extent() at btrfs_get_blocks_direct_write()). 
+ */ + if ((iocb->ki_flags & IOCB_NOWAIT) && + !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + return -EAGAIN; current->backing_dev_info = inode_to_bdi(inode); ret = file_remove_privs(file); @@ -1720,7 +1685,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb, WARN_ON(reserve_bytes == 0); ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), reserve_bytes, - reserve_bytes); + reserve_bytes, false); if (ret) { if (!only_release_metadata) btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -1883,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) { - const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC); struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -1937,15 +1901,6 @@ relock: } /* - * We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw() - * calls generic_write_sync() (through iomap_dio_complete()), because - * that results in calling fsync (btrfs_sync_file()) which will try to - * lock the inode in exclusive/write mode. - */ - if (is_sync_write) - iocb->ki_flags &= ~IOCB_DSYNC; - - /* * The iov_iter can be mapped to the same file range we are writing to. * If that's the case, then we will deadlock in the iomap code, because * it first calls our callback btrfs_dio_iomap_begin(), which will create @@ -1965,8 +1920,7 @@ relock: */ again: from->nofault = true; - err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, written); + err = btrfs_dio_rw(iocb, from, written); from->nofault = false; /* No increment (+=) because iomap returns a cumulative value. */ @@ -2001,17 +1955,24 @@ again: btrfs_inode_unlock(inode, ilock_flags); /* - * Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do - * the fsync (call generic_write_sync()). + * If 'err' is -ENOTBLK or we have not written all data, then it means + * we must fallback to buffered IO. */ - if (is_sync_write) - iocb->ki_flags |= IOCB_DSYNC; - - /* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */ if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from)) goto out; buffered: + /* + * If we are in a NOWAIT context, then return -EAGAIN to signal the caller + * it must retry the operation in a context where blocking is acceptable, + * since we currently don't have NOWAIT semantics support for buffered IO + * and may block there for many reasons (reserving space for example). 
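Both NOWAIT bail-outs above (the early check in btrfs_write_check() and the buffered fallback in btrfs_direct_write()) reach user space as EAGAIN on an RWF_NOWAIT write. A small self-contained illustration of the expected reaction, assuming a libc that exposes pwritev2() and RWF_NOWAIT; retrying without the flag may block:

#define _GNU_SOURCE
#include <errno.h>
#include <sys/types.h>
#include <sys/uio.h>

/* Try a non-blocking write first; fall back to a blocking one on EAGAIN. */
static ssize_t write_nowait_or_block(int fd, const struct iovec *iov, int iovcnt,
                                     off_t offset)
{
        ssize_t ret = pwritev2(fd, iov, iovcnt, offset, RWF_NOWAIT);

        if (ret < 0 && errno == EAGAIN)
                ret = pwritev2(fd, iov, iovcnt, offset, 0);
        return ret;
}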
+ */ + if (iocb->ki_flags & IOCB_NOWAIT) { + err = -EAGAIN; + goto out; + } + pos = iocb->ki_pos; written_buffered = btrfs_buffered_write(iocb, from); if (written_buffered < 0) { @@ -2074,7 +2035,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, struct file *file = iocb->ki_filp; struct btrfs_inode *inode = BTRFS_I(file_inode(file)); ssize_t num_written, num_sync; - const bool sync = iocb->ki_flags & IOCB_DSYNC; + const bool sync = iocb_is_dsync(iocb); /* * If the fs flips readonly due to some impossible error, although we @@ -2094,9 +2055,11 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from, num_written = btrfs_encoded_write(iocb, from, encoded); num_sync = encoded->len; } else if (iocb->ki_flags & IOCB_DIRECT) { - num_written = num_sync = btrfs_direct_write(iocb, from); + num_written = btrfs_direct_write(iocb, from); + num_sync = num_written; } else { - num_written = num_sync = btrfs_buffered_write(iocb, from); + num_written = btrfs_buffered_write(iocb, from); + num_sync = num_written; } btrfs_set_inode_last_sub_trans(inode); @@ -2344,7 +2307,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } /* we've logged all the items and now have a consistent @@ -2359,25 +2322,62 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - if (ret != BTRFS_NO_LOG_SYNC) { + if (ret == BTRFS_NO_LOG_SYNC) { + ret = btrfs_end_transaction(trans); + goto out; + } + + /* We successfully logged the inode, attempt to sync the log. */ + if (!ret) { + ret = btrfs_sync_log(trans, root, &ctx); if (!ret) { - ret = btrfs_sync_log(trans, root, &ctx); - if (!ret) { - ret = btrfs_end_transaction(trans); - goto out; - } - } - if (!full_sync) { - ret = btrfs_wait_ordered_range(inode, start, len); - if (ret) { - btrfs_end_transaction(trans); - goto out; - } + ret = btrfs_end_transaction(trans); + goto out; } - ret = btrfs_commit_transaction(trans); - } else { + } + + /* + * At this point we need to commit the transaction because we had + * btrfs_need_log_full_commit() or some other error. + * + * If we didn't do a full sync we have to stop the trans handle, wait on + * the ordered extents, start it again and commit the transaction. If + * we attempt to wait on the ordered extents here we could deadlock with + * something like fallocate() that is holding the extent lock trying to + * start a transaction while some other thread is trying to commit the + * transaction while we (fsync) are currently holding the transaction + * open. + */ + if (!full_sync) { ret = btrfs_end_transaction(trans); + if (ret) + goto out; + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) + goto out; + + /* + * This is safe to use here because we're only interested in + * making sure the transaction that had the ordered extents is + * committed. We aren't waiting on anything past this point, + * we're purely getting the transaction and committing it. + */ + trans = btrfs_attach_transaction_barrier(root); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + + /* + * We committed the transaction and there's no currently + * running transaction, this means everything we care + * about made it to disk and we are done. 
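The reworked tail of btrfs_sync_file() in this hunk is easier to read as a condensed sketch. This is not the verbatim code; the error handling, ctx release and full_sync bookkeeping shown above are omitted:

if (ret == BTRFS_NO_LOG_SYNC)                   /* nothing needed the log */
        return btrfs_end_transaction(trans);

if (ret == 0 && btrfs_sync_log(trans, root, &ctx) == 0)
        return btrfs_end_transaction(trans);    /* fast path: log synced */

/* Logging failed or a full transaction commit is required. */
if (!full_sync) {
        btrfs_end_transaction(trans);
        btrfs_wait_ordered_range(inode, start, len);
        trans = btrfs_attach_transaction_barrier(root);
        if (IS_ERR(trans))              /* -ENOENT: already committed, done */
                return PTR_ERR(trans) == -ENOENT ? 0 : PTR_ERR(trans);
}
return btrfs_commit_transaction(trans);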
+ */ + if (ret == -ENOENT) + ret = 0; + goto out; + } } + + ret = btrfs_commit_transaction(trans); out: ASSERT(list_empty(&ctx.list)); err = file_check_and_advance_wb_err(file); @@ -2401,7 +2401,7 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) { struct address_space *mapping = filp->f_mapping; - if (!mapping->a_ops->readpage) + if (!mapping->a_ops->read_folio) return -ENOEXEC; file_accessed(filp); @@ -2570,10 +2570,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) return ret; } -static int btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, + const u64 lockend, + struct extent_state **cached_state) { /* * For subpage case, if the range is not at page boundary, we could @@ -2587,40 +2587,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; while (1) { - struct btrfs_ordered_extent *ordered; - int ret; - truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - lockend); - /* - * We need to make sure we have no ordered extents in this range - * and nobody raced in and read a page in this range, if we did - * we need to try again. + * We can't have ordered extents in the range, nor dirty/writeback + * pages, because we have locked the inode's VFS lock in exclusive + * mode, we have locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range and we have waited + * for any ordered extents in the range to complete. + * We can race with anyone reading pages from this range, so after + * locking the range check if we have pages in the range, and if + * we do, unlock the range and retry. 
*/ - if ((!ordered || - (ordered->file_offset + ordered->num_bytes <= lockstart || - ordered->file_offset > lockend)) && - !filemap_range_has_page(inode->i_mapping, - page_lockstart, page_lockend)) { - if (ordered) - btrfs_put_ordered_extent(ordered); + if (!filemap_range_has_page(inode->i_mapping, page_lockstart, + page_lockend)) break; - } - if (ordered) - btrfs_put_ordered_extent(ordered); + unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ret = btrfs_wait_ordered_range(inode, lockstart, - lockend - lockstart + 1); - if (ret) - return ret; } - return 0; + + btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); } static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, @@ -2744,7 +2733,7 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, goto out; } rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; /* * 1 - update the inode @@ -2766,7 +2755,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); + if (WARN_ON(ret)) + goto out_trans; trans->block_rsv = rsv; cur_offset = start; @@ -2850,6 +2840,25 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, extent_info->file_offset += replace_len; } + /* + * We are releasing our handle on the transaction, balance the + * dirty pages of the btree inode and flush delayed items, and + * then get a new transaction handle, which may now point to a + * new transaction in case someone else may have committed the + * transaction we used to replace/drop file extent items. So + * bump the inode's iversion and update mtime and ctime except + * if we are called from a dedupe context. This is because a + * power failure/crash may happen after the transaction is + * committed and before we finish replacing/dropping all the + * file extent items we need. 
+ */ + inode_inc_iversion(&inode->vfs_inode); + + if (!extent_info || extent_info->update_times) { + inode->vfs_inode.i_mtime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = inode->vfs_inode.i_mtime; + } + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2866,7 +2875,8 @@ int btrfs_replace_file_extents(struct btrfs_inode *inode, ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv, min_size, false); - BUG_ON(ret); /* shouldn't happen */ + if (WARN_ON(ret)) + break; trans->block_rsv = rsv; cur_offset = drop_args.drop_end; @@ -2976,11 +2986,12 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) bool truncated_block = false; bool updated_inode = false; + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); + ret = btrfs_wait_ordered_range(inode, offset, len); if (ret) - return ret; + goto out_only_mutex; - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -3072,10 +3083,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) goto out_only_mutex; } - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out_only_mutex; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state); path = btrfs_alloc_path(); if (!path) { @@ -3091,7 +3099,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) ASSERT(trans != NULL); inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); updated_inode = true; btrfs_end_transaction(trans); @@ -3237,8 +3246,6 @@ static int btrfs_zero_range(struct inode *inode, u64 bytes_to_reserve = 0; bool space_reserved = false; - inode_dio_wait(inode); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start, alloc_end - alloc_start); if (IS_ERR(em)) { @@ -3368,10 +3375,8 @@ reserve_space: if (ret < 0) goto out; space_reserved = true; - ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend, - &cached_state); - if (ret) - goto out; + btrfs_punch_hole_lock_range(inode, lockstart, lockend, + &cached_state); ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { @@ -3417,6 +3422,9 @@ static long btrfs_fallocate(struct file *file, int mode, u64 alloc_hint = 0; u64 locked_end; u64 actual_end = 0; + u64 data_space_needed = 0; + u64 data_space_reserved = 0; + u64 qgroup_reserved = 0; struct extent_map *em; int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; @@ -3437,18 +3445,6 @@ static long btrfs_fallocate(struct file *file, int mode, if (mode & FALLOC_FL_PUNCH_HOLE) return btrfs_punch_hole(file, offset, len); - /* - * Only trigger disk allocation, don't trigger qgroup reserve - * - * For qgroup space, it will be checked later. - */ - if (!(mode & FALLOC_FL_ZERO_RANGE)) { - ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), - alloc_end - alloc_start); - if (ret < 0) - return ret; - } - btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { @@ -3485,8 +3481,12 @@ static long btrfs_fallocate(struct file *file, int mode, } /* - * wait for ordered IO before we have any locks. We'll loop again - * below with the locks held. 
+ * We have locked the inode at the VFS level (in exclusive mode) and we + * have locked the i_mmap_lock lock (in exclusive mode). Now before + * locking the file range, flush all dealloc in the range and wait for + * all ordered extents in the range to complete. After this we can lock + * the file range and, due to the previous locking we did, we know there + * can't be more delalloc or ordered extents in the range. */ ret = btrfs_wait_ordered_range(inode, alloc_start, alloc_end - alloc_start); @@ -3500,38 +3500,10 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - while (1) { - struct btrfs_ordered_extent *ordered; + lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); - /* the extent lock is ordered inside the running - * transaction - */ - lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - locked_end); - - if (ordered && - ordered->file_offset + ordered->num_bytes > alloc_start && - ordered->file_offset < alloc_end) { - btrfs_put_ordered_extent(ordered); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - alloc_start, locked_end, - &cached_state); - /* - * we can't wait on the range with the transaction - * running or with the extent lock held - */ - ret = btrfs_wait_ordered_range(inode, alloc_start, - alloc_end - alloc_start); - if (ret) - goto out; - } else { - if (ordered) - btrfs_put_ordered_extent(ordered); - break; - } - } + btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); /* First, check if we exceed the qgroup limit */ INIT_LIST_HEAD(&reserve_list); @@ -3548,48 +3520,64 @@ static long btrfs_fallocate(struct file *file, int mode, if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { - ret = add_falloc_range(&reserve_list, cur_offset, - last_byte - cur_offset); + const u64 range_len = last_byte - cur_offset; + + ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), - &data_reserved, cur_offset, - last_byte - cur_offset); + &data_reserved, cur_offset, range_len); if (ret < 0) { - cur_offset = last_byte; free_extent_map(em); break; } - } else { - /* - * Do not need to reserve unwritten extent for this - * range, free reserved data space first, otherwise - * it'll result in false ENOSPC error. - */ - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, cur_offset, - last_byte - cur_offset); + qgroup_reserved += range_len; + data_space_needed += range_len; } free_extent_map(em); cur_offset = last_byte; } + if (!ret && data_space_needed > 0) { + /* + * We are safe to reserve space here as we can't have delalloc + * in the range, see above. + */ + ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode), + data_space_needed); + if (!ret) + data_space_reserved = data_space_needed; + } + /* * If ret is still 0, means we're OK to fallocate. * Or just cleanup the list and exit. */ list_for_each_entry_safe(range, tmp, &reserve_list, list) { - if (!ret) + if (!ret) { ret = btrfs_prealloc_file_range(inode, mode, range->start, range->len, i_blocksize(inode), offset + len, &alloc_hint); - else + /* + * btrfs_prealloc_file_range() releases space even + * if it returns an error. 
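For reference, the two paths reworked here correspond to ordinary user-space fallocate() calls; a self-contained usage example (path and sizes are arbitrary), where the plain preallocation ends up in btrfs_fallocate()/btrfs_prealloc_file_range() and the hole punch in btrfs_punch_hole():

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int example_fallocate_usage(const char *path)
{
        int fd = open(path, O_RDWR | O_CREAT, 0644);

        if (fd < 0)
                return -1;

        /* Preallocate 16 MiB of unwritten extents. */
        if (fallocate(fd, 0, 0, 16 << 20) < 0)
                goto fail;

        /* Punch a 1 MiB hole at offset 4 MiB without changing i_size. */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      4 << 20, 1 << 20) < 0)
                goto fail;

        return close(fd);
fail:
        close(fd);
        return -1;
}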
+ */ + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (data_space_reserved > 0) { btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, range->start, - range->len); + data_reserved, range->start, + range->len); + data_space_reserved -= range->len; + qgroup_reserved -= range->len; + } else if (qgroup_reserved > 0) { + btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, + range->start, range->len); + qgroup_reserved -= range->len; + } list_del(&range->list); kfree(range); } @@ -3606,10 +3594,6 @@ out_unlock: &cached_state); out: btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); - /* Let go of our reservation. */ - if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) - btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, - cur_offset, alloc_end - cur_offset); extent_changeset_free(data_reserved); return ret; } @@ -3767,8 +3751,7 @@ again: */ pagefault_disable(); to->nofault = true; - ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - IOMAP_DIO_PARTIAL, read); + ret = btrfs_dio_rw(iocb, to, read); to->nofault = false; pagefault_enable(); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 01a408db5683..996da650ecdc 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -465,7 +465,7 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) io_ctl->pages[i] = page; if (uptodate && !PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, @@ -2630,16 +2630,19 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { - struct btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *sinfo = block_group->space_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; - const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold); + int bg_reclaim_threshold = 0; bool initial = (size == block_group->length); u64 reclaimable_unusable; WARN_ON(!initial && offset + size > block_group->zone_capacity); + if (!initial) + bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold); + spin_lock(&ctl->tree_lock); if (!used) to_free = size; @@ -3533,7 +3536,8 @@ int btrfs_find_space_cluster(struct btrfs_block_group *block_group, * data, keep it dense. 
*/ if (btrfs_test_opt(fs_info, SSD_SPREAD)) { - cont1_bytes = min_bytes = bytes + empty_size; + cont1_bytes = bytes + empty_size; + min_bytes = cont1_bytes; } else if (block_group->flags & BTRFS_BLOCK_GROUP_METADATA) { cont1_bytes = bytes; min_bytes = fs_info->sectorsize; @@ -4069,7 +4073,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, btrfs_info(fs_info, "cleaning free space cache v1"); - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); ret = btrfs_remove_free_space_inode(trans, NULL, block_group); diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index 0ae54d8c10d6..1bf89aa67216 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) goto abort; } - node = rb_first(&fs_info->block_group_cache_tree); + node = rb_first_cached(&fs_info->block_group_cache_tree); while (node) { block_group = rb_entry(node, struct btrfs_block_group, cache_node); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 95c499b8424e..f0c97d25b4a0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -64,8 +64,36 @@ struct btrfs_iget_args { struct btrfs_dio_data { ssize_t submitted; struct extent_changeset *data_reserved; + bool data_space_reserved; + bool nocow_done; }; +struct btrfs_dio_private { + struct inode *inode; + + /* + * Since DIO can use anonymous page, we cannot use page_offset() to + * grab the file offset, thus need a dedicated member for file offset. + */ + u64 file_offset; + /* Used for bio::bi_size */ + u32 bytes; + + /* + * References to this structure. There is one reference per in-flight + * bio plus one while we're still setting up. + */ + refcount_t refs; + + /* Array of checksums */ + u8 *csums; + + /* This must be last */ + struct bio bio; +}; + +static struct bio_set btrfs_dio_bioset; + struct btrfs_rename_ctx { /* Output field. Stores the index number of the old directory entry. 
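struct btrfs_dio_private above carries its struct bio as the last member and is allocated from btrfs_dio_bioset. The usual pattern with such a layout, presumably a bioset initialised with a front pad of offsetof(struct btrfs_dio_private, bio), is to recover the container from the embedded bio; the helper name below is made up:

/* Only valid for bios allocated from btrfs_dio_bioset. */
static inline struct btrfs_dio_private *bio_to_dio_private(struct bio *bio)
{
        return container_of(bio, struct btrfs_dio_private, bio);
}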
*/ u64 index; @@ -86,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep; static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode, bool skip_writeback); -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock); + unsigned long *nr_written, int unlock, + u64 *done_offset); static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, u64 len, u64 orig_start, u64 block_start, u64 block_len, u64 orig_block_len, u64 ram_bytes, int compress_type, int type); -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate); - /* * btrfs_inode_lock - lock inode i_rwsem based on arguments passed * @@ -167,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, { unsigned long index = offset >> PAGE_SHIFT; unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT; - u64 page_start = page_offset(locked_page); - u64 page_end = page_start + PAGE_SIZE - 1; - + u64 page_start, page_end; struct page *page; + if (locked_page) { + page_start = page_offset(locked_page); + page_end = page_start + PAGE_SIZE - 1; + } + while (index <= end_index) { /* * For locked page, we will call end_extent_writepage() on it @@ -184,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * btrfs_mark_ordered_io_finished() would skip the accounting * for the page range, and the ordered extent will never finish. */ - if (index == (page_offset(locked_page) >> PAGE_SHIFT)) { + if (locked_page && index == (page_start >> PAGE_SHIFT)) { index++; continue; } @@ -195,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, /* * Here we just clear all Ordered bits for every page in the - * range, then __endio_write_update_ordered() will handle + * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. 
*/ btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, @@ -203,34 +230,47 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, put_page(page); } - /* The locked page covers the full range, nothing needs to be done */ - if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE) - return; - /* - * In case this page belongs to the delalloc range being instantiated - * then skip it, since the first page of a range is going to be - * properly cleaned up by the caller of run_delalloc_range - */ - if (page_start >= offset && page_end <= (offset + bytes - 1)) { - bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; - offset = page_offset(locked_page) + PAGE_SIZE; + if (locked_page) { + /* The locked page covers the full range, nothing needs to be done */ + if (bytes + offset <= page_start + PAGE_SIZE) + return; + /* + * In case this page belongs to the delalloc range being + * instantiated then skip it, since the first page of a range is + * going to be properly cleaned up by the caller of + * run_delalloc_range + */ + if (page_start >= offset && page_end <= (offset + bytes - 1)) { + bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE; + offset = page_offset(locked_page) + PAGE_SIZE; + } } - return __endio_write_update_ordered(inode, offset, bytes, false); + return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false); } static int btrfs_dirty_inode(struct inode *inode); static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, - struct inode *inode, struct inode *dir, - const struct qstr *qstr) + struct btrfs_new_inode_args *args) { int err; - err = btrfs_init_acl(trans, inode, dir); - if (!err) - err = btrfs_xattr_security_init(trans, inode, dir, qstr); - return err; + if (args->default_acl) { + err = __btrfs_set_acl(trans, args->inode, args->default_acl, + ACL_TYPE_DEFAULT); + if (err) + return err; + } + if (args->acl) { + err = __btrfs_set_acl(trans, args->inode, args->acl, ACL_TYPE_ACCESS); + if (err) + return err; + } + if (!args->default_acl && !args->acl) + cache_no_acl(args->inode); + return btrfs_xattr_security_init(trans, args->inode, args->dir, + &args->dentry->d_name); } /* @@ -294,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, cur_size = min_t(unsigned long, compressed_size, PAGE_SIZE); - kaddr = kmap_atomic(cpage); + kaddr = kmap_local_page(cpage); write_extent_buffer(leaf, kaddr, ptr, cur_size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); i++; ptr += cur_size; @@ -307,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, } else { page = find_get_page(inode->vfs_inode.i_mapping, 0); btrfs_set_file_extent_compression(leaf, ei, 0); - kaddr = kmap_atomic(page); + kaddr = kmap_local_page(page); write_extent_buffer(leaf, kaddr, ptr, size); - kunmap_atomic(kaddr); + kunmap_local(kaddr); put_page(page); } btrfs_mark_buffer_dirty(leaf); @@ -447,7 +487,7 @@ struct async_chunk { struct page *locked_page; u64 start; u64 end; - unsigned int write_flags; + blk_opf_t write_flags; struct list_head extents; struct cgroup_subsys_state *blkcg_css; struct btrfs_work work; @@ -522,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, * will unlock the full page. */ if (fs_info->sectorsize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(end + 1, PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(end + 1)) return 0; } @@ -640,8 +680,8 @@ again: * Thus we must also check against @actual_end, not just @end. 
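insert_inline_extent() above moves from kmap_atomic()/kunmap_atomic() to kmap_local_page()/kunmap_local(). Unlike the atomic variant, the local mapping does not disable preemption (only CPU migration), but nested mappings must still be released in reverse order; a fragment with placeholder src_page/dst_page:

void *src = kmap_local_page(src_page);
void *dst = kmap_local_page(dst_page);

memcpy(dst, src, PAGE_SIZE);

kunmap_local(dst);      /* release in reverse order of mapping */
kunmap_local(src);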
*/ if (blocksize < PAGE_SIZE) { - if (!IS_ALIGNED(start, PAGE_SIZE) || - !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE)) + if (!PAGE_ALIGNED(start) || + !PAGE_ALIGNED(round_up(actual_end, blocksize))) goto cleanup_and_bail_uncompressed; } @@ -882,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode, * can directly submit them without interruption. */ ret = cow_file_range(inode, locked_page, start, end, &page_started, - &nr_written, 0); + &nr_written, 0, NULL); /* Inline extent inserted, page gets unlocked and everything is done */ if (page_started) { ret = 0; goto out; } if (ret < 0) { - if (locked_page) + btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1); + if (locked_page) { + const u64 page_start = page_offset(locked_page); + const u64 page_end = page_start + PAGE_SIZE - 1; + + btrfs_page_set_error(inode->root->fs_info, locked_page, + page_start, PAGE_SIZE); + set_page_writeback(locked_page); + end_page_writeback(locked_page); + end_extent_writepage(locked_page, ret, page_start, page_end); unlock_page(locked_page); + } goto out; } @@ -1095,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * *page_started is set to one if we unlock locked_page and do everything * required to start IO on it. It may be clean and already done with * IO when we return. + * + * When unlock == 1, we unlock the pages in successfully allocated regions. + * When unlock == 0, we leave them locked for writing them out. + * + * However, we unlock all the pages except @locked_page in case of failure. + * + * In summary, page locking state will be as follow: + * + * - page_started == 1 (return value) + * - All the pages are unlocked. IO is started. + * - Note that this can happen only on success + * - unlock == 1 + * - All the pages except @locked_page are unlocked in any case + * - unlock == 0 + * - On success, all the pages are locked for writing out them + * - On failure, all the pages except @locked_page are unlocked + * + * When a failure happens in the second or later iteration of the + * while-loop, the ordered extents created in previous iterations are kept + * intact. So, the caller must clean them up by calling + * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for + * example. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct page *locked_page, u64 start, u64 end, int *page_started, - unsigned long *nr_written, int unlock) + unsigned long *nr_written, int unlock, + u64 *done_offset) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; u64 alloc_hint = 0; + u64 orig_start = start; u64 num_bytes; unsigned long ram_size; u64 cur_alloc_size = 0; @@ -1291,18 +1365,62 @@ out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); out_unlock: + /* + * If done_offset is non-NULL and ret == -EAGAIN, we expect the + * caller to write out the successfully allocated region and retry. + */ + if (done_offset && ret == -EAGAIN) { + if (orig_start < start) + *done_offset = start - 1; + else + *done_offset = start; + return ret; + } else if (ret == -EAGAIN) { + /* Convert to -ENOSPC since the caller cannot retry. */ + ret = -ENOSPC; + } + + /* + * Now, we have three regions to clean up: + * + * |-------(1)----|---(2)---|-------------(3)----------| + * `- orig_start `- start `- start + cur_alloc_size `- end + * + * We process each region below. 
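The (1)/(2)/(3) diagram above splits [orig_start, end] by how far allocation got before the failure; the per-region cleanup follows below. As a standalone model of just the boundary arithmetic (plain C, no kernel types; region (2) corresponds to the extent_reserved case and the edge handling is simplified):

#include <stdint.h>
#include <stdio.h>

static void print_cleanup_regions(uint64_t orig_start, uint64_t start,
                                  uint64_t cur_alloc_size, uint64_t end)
{
        if (orig_start < start)         /* (1): ordered extents already created */
                printf("region 1: [%ju, %ju]\n", (uintmax_t)orig_start,
                       (uintmax_t)(start - 1));
        if (cur_alloc_size)             /* (2): extent reserved, no ordered extent */
                printf("region 2: [%ju, %ju]\n", (uintmax_t)start,
                       (uintmax_t)(start + cur_alloc_size - 1));
        if (start + cur_alloc_size < end)       /* (3): never touched */
                printf("region 3: [%ju, %ju]\n",
                       (uintmax_t)(start + cur_alloc_size), (uintmax_t)end);
}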
+ */ + clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV; page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK; + + /* + * For the range (1). We have already instantiated the ordered extents + * for this region. They are cleaned up by + * btrfs_cleanup_ordered_extents() in e.g, + * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are + * already cleared in the above loop. And, EXTENT_DELALLOC_NEW | + * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup + * function. + * + * However, in case of unlock == 0, we still need to unlock the pages + * (except @locked_page) to ensure all the pages are unlocked. + */ + if (!unlock && orig_start < start) { + if (!locked_page) + mapping_set_error(inode->vfs_inode.i_mapping, ret); + extent_clear_unlock_delalloc(inode, orig_start, start - 1, + locked_page, 0, page_ops); + } + /* - * If we reserved an extent for our delalloc range (or a subrange) and - * failed to create the respective ordered extent, then it means that - * when we reserved the extent we decremented the extent's size from - * the data space_info's bytes_may_use counter and incremented the - * space_info's bytes_reserved counter by the same amount. We must make - * sure extent_clear_unlock_delalloc() does not try to decrement again - * the data space_info's bytes_may_use counter, therefore we do not pass - * it the flag EXTENT_CLEAR_DATA_RESV. + * For the range (2). If we reserved an extent for our delalloc range + * (or a subrange) and failed to create the respective ordered extent, + * then it means that when we reserved the extent we decremented the + * extent's size from the data space_info's bytes_may_use counter and + * incremented the space_info's bytes_reserved counter by the same + * amount. We must make sure extent_clear_unlock_delalloc() does not try + * to decrement again the data space_info's bytes_may_use counter, + * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV. */ if (extent_reserved) { extent_clear_unlock_delalloc(inode, start, @@ -1312,12 +1430,19 @@ out_unlock: page_ops); start += cur_alloc_size; if (start >= end) - goto out; + return ret; } + + /* + * For the range (3). We never touched the region. In addition to the + * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data + * space_info's bytes_may_use counter, reserved in + * btrfs_check_data_free_space(). 
+ */ extent_clear_unlock_delalloc(inode, start, end, locked_page, clear_bits | EXTENT_CLEAR_DATA_RESV, page_ops); - goto out; + return ret; } /* @@ -1397,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode, int i; bool should_compress; unsigned nofs_flag; - const unsigned int write_flags = wbc_to_write_flags(wbc); + const blk_opf_t write_flags = wbc_to_write_flags(wbc); unlock_extent(&inode->io_tree, start, end); @@ -1500,19 +1625,42 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode, u64 end, int *page_started, unsigned long *nr_written) { + u64 done_offset = end; int ret; + bool locked_page_done = false; - ret = cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 0); - if (ret) - return ret; + while (start <= end) { + ret = cow_file_range(inode, locked_page, start, end, page_started, + nr_written, 0, &done_offset); + if (ret && ret != -EAGAIN) + return ret; - if (*page_started) - return 0; + if (*page_started) { + ASSERT(ret == 0); + return 0; + } + + if (ret == 0) + done_offset = end; + + if (done_offset == start) { + struct btrfs_fs_info *info = inode->root->fs_info; + + wait_var_event(&info->zone_finish_wait, + !test_bit(BTRFS_FS_NEED_ZONE_FINISH, &info->flags)); + continue; + } + + if (!locked_page_done) { + __set_page_dirty_nobuffers(locked_page); + account_page_redirty(locked_page); + } + locked_page_done = true; + extent_write_locked_range(&inode->vfs_inode, start, done_offset); + + start = done_offset + 1; + } - __set_page_dirty_nobuffers(locked_page); - account_page_redirty(locked_page); - extent_write_locked_range(&inode->vfs_inode, start, end); *page_started = 1; return 0; @@ -1604,7 +1752,142 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, } return cow_file_range(inode, locked_page, start, end, page_started, - nr_written, 1); + nr_written, 1, NULL); +} + +struct can_nocow_file_extent_args { + /* Input fields. */ + + /* Start file offset of the range we want to NOCOW. */ + u64 start; + /* End file offset (inclusive) of the range we want to NOCOW. */ + u64 end; + bool writeback_path; + bool strict; + /* + * Free the path passed to can_nocow_file_extent() once it's not needed + * anymore. + */ + bool free_path; + + /* Output fields. Only set when can_nocow_file_extent() returns 1. */ + + u64 disk_bytenr; + u64 disk_num_bytes; + u64 extent_offset; + /* Number of bytes that can be written to in NOCOW mode. */ + u64 num_bytes; +}; + +/* + * Check if we can NOCOW the file extent that the path points to. + * This function may return with the path released, so the caller should check + * if path->nodes[0] is NULL or not if it needs to use the path afterwards. + * + * Returns: < 0 on error + * 0 if we can not NOCOW + * 1 if we can NOCOW + */ +static int can_nocow_file_extent(struct btrfs_path *path, + struct btrfs_key *key, + struct btrfs_inode *inode, + struct can_nocow_file_extent_args *args) +{ + const bool is_freespace_inode = btrfs_is_free_space_inode(inode); + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_root *root = inode->root; + struct btrfs_file_extent_item *fi; + u64 extent_end; + u8 extent_type; + int can_nocow = 0; + int ret = 0; + + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_INLINE) + goto out; + + /* Can't access these fields unless we know it's not an inline extent. 
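Stepping back to run_delalloc_zoned() earlier in this hunk: it turns the new done_offset/-EAGAIN contract of cow_file_range() into a resubmission loop, writing out what was allocated, waiting for the zone-finish condition when no progress was made, then retrying from done_offset + 1. A standalone model of that control flow (plain C; do_range() and wait_for_more_space() are hypothetical stand-ins and left undefined):

#include <errno.h>
#include <stdint.h>

/* Stand-in for cow_file_range(): 0 on success, -EAGAIN with *done set to the
 * last byte handled on partial progress, or another negative error. */
int do_range(uint64_t start, uint64_t end, uint64_t *done);
void wait_for_more_space(void);         /* models the zone_finish_wait */

static int run_range(uint64_t start, uint64_t end)
{
        while (start <= end) {
                uint64_t done = end;
                int ret = do_range(start, end, &done);

                if (ret && ret != -EAGAIN)
                        return ret;             /* hard error */
                if (ret == 0)
                        done = end;             /* everything was handled */
                if (done == start) {            /* no forward progress */
                        wait_for_more_space();
                        continue;
                }
                /* write out / finish [start, done], then advance */
                start = done + 1;
        }
        return 0;
}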
*/ + args->disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + args->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + args->extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (!(inode->flags & BTRFS_INODE_NODATACOW) && + extent_type == BTRFS_FILE_EXTENT_REG) + goto out; + + /* + * If the extent was created before the generation where the last snapshot + * for its subvolume was created, then this implies the extent is shared, + * hence we must COW. + */ + if (!args->strict && + btrfs_file_extent_generation(leaf, fi) <= + btrfs_root_last_snapshot(&root->root_item)) + goto out; + + /* An explicit hole, must COW. */ + if (args->disk_bytenr == 0) + goto out; + + /* Compressed/encrypted/encoded extents must be COWed. */ + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out; + + extent_end = btrfs_file_extent_end(path); + + /* + * The following checks can be expensive, as they need to take other + * locks and do btree or rbtree searches, so release the path to avoid + * blocking other tasks for too long. + */ + btrfs_release_path(path); + + ret = btrfs_cross_ref_exist(root, btrfs_ino(inode), + key->offset - args->extent_offset, + args->disk_bytenr, false, path); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + if (args->free_path) { + /* + * We don't need the path anymore, plus through the + * csum_exist_in_range() call below we will end up allocating + * another path. So free the path to avoid unnecessary extra + * memory usage. + */ + btrfs_free_path(path); + path = NULL; + } + + /* If there are pending snapshots for this root, we must COW. */ + if (args->writeback_path && !is_freespace_inode && + atomic_read(&root->snapshot_force_cow)) + goto out; + + args->disk_bytenr += args->extent_offset; + args->disk_bytenr += args->start - key->offset; + args->num_bytes = min(args->end + 1, extent_end) - args->start; + + /* + * Force COW if csums exist in the range. This ensures that csums for a + * given extent are either valid or do not exist. + */ + ret = csum_exist_in_range(root->fs_info, args->disk_bytenr, args->num_bytes); + WARN_ON_ONCE(ret > 0 && is_freespace_inode); + if (ret != 0) + goto out; + + can_nocow = 1; + out: + if (args->free_path && path) + btrfs_free_path(path); + + return ret < 0 ? ret : can_nocow; } /* @@ -1627,11 +1910,10 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 cur_offset = start; int ret; bool check_prev = true; - const bool freespace_inode = btrfs_is_free_space_inode(inode); u64 ino = btrfs_ino(inode); + struct btrfs_block_group *bg; bool nocow = false; - u64 disk_bytenr = 0; - const bool force = inode->flags & BTRFS_INODE_NODATACOW; + struct can_nocow_file_extent_args nocow_args = { 0 }; path = btrfs_alloc_path(); if (!path) { @@ -1644,15 +1926,16 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, return -ENOMEM; } + nocow_args.end = end; + nocow_args.writeback_path = true; + while (1) { struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; u64 extent_end; - u64 extent_offset; - u64 num_bytes = 0; - u64 disk_num_bytes; u64 ram_bytes; + u64 nocow_end; int extent_type; nocow = false; @@ -1728,116 +2011,38 @@ next_slot: fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); extent_type = btrfs_file_extent_type(leaf, fi); - + /* If this is triggered then we have a memory corruption. 
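can_nocow_file_extent() above centralises the checks that used to be open-coded in run_delalloc_nocow(). Reduced to a standalone predicate over already-extracted facts (plain C; every input is a boolean the real function derives from the file extent item and btree lookups):

#include <stdbool.h>

struct nocow_facts {
        bool inline_extent;             /* BTRFS_FILE_EXTENT_INLINE */
        bool nodatacow_inode;           /* inode has BTRFS_INODE_NODATACOW */
        bool prealloc_extent;           /* BTRFS_FILE_EXTENT_PREALLOC */
        bool newer_than_last_snapshot;  /* treat as true when args->strict is set */
        bool is_hole;                   /* disk_bytenr == 0 */
        bool compressed_or_encoded;     /* compression/encryption/other encoding */
        bool cross_ref_exists;          /* extent shared with another root/inode */
        bool snapshot_pending;          /* only checked on the writeback path */
        bool csums_in_range;
};

static bool can_nocow(const struct nocow_facts *f)
{
        if (f->inline_extent || f->is_hole || f->compressed_or_encoded)
                return false;
        if (!f->nodatacow_inode && !f->prealloc_extent)
                return false;
        if (!f->newer_than_last_snapshot)
                return false;
        if (f->cross_ref_exists || f->snapshot_pending || f->csums_in_range)
                return false;
        return true;
}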
*/ + ASSERT(extent_type < BTRFS_NR_FILE_EXTENT_TYPES); + if (WARN_ON(extent_type >= BTRFS_NR_FILE_EXTENT_TYPES)) { + ret = -EUCLEAN; + goto error; + } ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - if (extent_type == BTRFS_FILE_EXTENT_REG || - extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - extent_offset = btrfs_file_extent_offset(leaf, fi); - extent_end = found_key.offset + - btrfs_file_extent_num_bytes(leaf, fi); - disk_num_bytes = - btrfs_file_extent_disk_num_bytes(leaf, fi); - /* - * If the extent we got ends before our current offset, - * skip to the next extent. - */ - if (extent_end <= cur_offset) { - path->slots[0]++; - goto next_slot; - } - /* Skip holes */ - if (disk_bytenr == 0) - goto out_check; - /* Skip compressed/encrypted/encoded extents */ - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out_check; - /* - * If extent is created before the last volume's snapshot - * this implies the extent is shared, hence we can't do - * nocow. This is the same check as in - * btrfs_cross_ref_exist but without calling - * btrfs_search_slot. - */ - if (!freespace_inode && - btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item)) - goto out_check; - if (extent_type == BTRFS_FILE_EXTENT_REG && !force) - goto out_check; + extent_end = btrfs_file_extent_end(path); - /* - * The following checks can be expensive, as they need to - * take other locks and do btree or rbtree searches, so - * release the path to avoid blocking other tasks for too - * long. - */ - btrfs_release_path(path); + /* + * If the extent we got ends before our current offset, skip to + * the next extent. + */ + if (extent_end <= cur_offset) { + path->slots[0]++; + goto next_slot; + } - ret = btrfs_cross_ref_exist(root, ino, - found_key.offset - - extent_offset, disk_bytenr, false); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. - */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } + nocow_args.start = cur_offset; + ret = can_nocow_file_extent(path, &found_key, inode, &nocow_args); + if (ret < 0) { + if (cow_start != (u64)-1) + cur_offset = cow_start; + goto error; + } else if (ret == 0) { + goto out_check; + } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - disk_bytenr += extent_offset; - disk_bytenr += cur_offset - found_key.offset; - num_bytes = min(end + 1, extent_end) - cur_offset; - /* - * If there are pending snapshots for this root, we - * fall into common COW way - */ - if (!freespace_inode && atomic_read(&root->snapshot_force_cow)) - goto out_check; - /* - * force cow if csum exists in the range. - * this ensure that csum for a given extent are - * either valid or do not exist. - */ - ret = csum_exist_in_range(fs_info, disk_bytenr, - num_bytes); - if (ret) { - /* - * ret could be -EIO if the above fails to read - * metadata. 
- */ - if (ret < 0) { - if (cow_start != (u64)-1) - cur_offset = cow_start; - goto error; - } - WARN_ON_ONCE(freespace_inode); - goto out_check; - } - /* If the extent's block group is RO, we must COW */ - if (!btrfs_inc_nocow_writers(fs_info, disk_bytenr)) - goto out_check; + ret = 0; + bg = btrfs_inc_nocow_writers(fs_info, nocow_args.disk_bytenr); + if (bg) nocow = true; - } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { - extent_end = found_key.offset + ram_bytes; - extent_end = ALIGN(extent_end, fs_info->sectorsize); - /* Skip extents outside of our requested range */ - if (extent_end <= start) { - path->slots[0]++; - goto next_slot; - } - } else { - /* If this triggers then we have a memory corruption */ - BUG(); - } out_check: /* * If nocow is false then record the beginning of the range @@ -1869,15 +2074,17 @@ out_check: cow_start = (u64)-1; } + nocow_end = cur_offset + nocow_args.num_bytes - 1; + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { - u64 orig_start = found_key.offset - extent_offset; + u64 orig_start = found_key.offset - nocow_args.extent_offset; struct extent_map *em; - em = create_io_em(inode, cur_offset, num_bytes, + em = create_io_em(inode, cur_offset, nocow_args.num_bytes, orig_start, - disk_bytenr, /* block_start */ - num_bytes, /* block_len */ - disk_num_bytes, /* orig_block_len */ + nocow_args.disk_bytenr, /* block_start */ + nocow_args.num_bytes, /* block_len */ + nocow_args.disk_num_bytes, /* orig_block_len */ ram_bytes, BTRFS_COMPRESS_NONE, BTRFS_ORDERED_PREALLOC); if (IS_ERR(em)) { @@ -1886,20 +2093,23 @@ out_check: } free_extent_map(em); ret = btrfs_add_ordered_extent(inode, - cur_offset, num_bytes, num_bytes, - disk_bytenr, num_bytes, 0, + cur_offset, nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, 1 << BTRFS_ORDERED_PREALLOC, BTRFS_COMPRESS_NONE); if (ret) { btrfs_drop_extent_cache(inode, cur_offset, - cur_offset + num_bytes - 1, - 0); + nocow_end, 0); goto error; } } else { ret = btrfs_add_ordered_extent(inode, cur_offset, - num_bytes, num_bytes, - disk_bytenr, num_bytes, + nocow_args.num_bytes, + nocow_args.num_bytes, + nocow_args.disk_bytenr, + nocow_args.num_bytes, 0, 1 << BTRFS_ORDERED_NOCOW, BTRFS_COMPRESS_NONE); @@ -1907,9 +2117,10 @@ out_check: goto error; } - if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); - nocow = false; + if (nocow) { + btrfs_dec_nocow_writers(bg); + nocow = false; + } if (btrfs_is_data_reloc_root(root)) /* @@ -1918,10 +2129,9 @@ out_check: * from freeing metadata of created ordered extent. 
*/ ret = btrfs_reloc_clone_csums(inode, cur_offset, - num_bytes); + nocow_args.num_bytes); - extent_clear_unlock_delalloc(inode, cur_offset, - cur_offset + num_bytes - 1, + extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, locked_page, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_CLEAR_DATA_RESV, @@ -1954,7 +2164,7 @@ out_check: error: if (nocow) - btrfs_dec_nocow_writers(fs_info, disk_bytenr); + btrfs_dec_nocow_writers(bg); if (ret && cur_offset < end) extent_clear_unlock_delalloc(inode, cur_offset, end, @@ -2015,7 +2225,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page page_started, nr_written); else ret = cow_file_range(inode, locked_page, start, end, - page_started, nr_written, 1); + page_started, nr_written, 1, NULL); } else { set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags); ret = cow_file_range_async(inode, wbc, locked_page, start, end, @@ -2031,6 +2241,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page void btrfs_split_delalloc_extent(struct inode *inode, struct extent_state *orig, u64 split) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 size; /* not delalloc, ignore it */ @@ -2038,7 +2249,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, return; size = orig->end - orig->start + 1; - if (size > BTRFS_MAX_EXTENT_SIZE) { + if (size > fs_info->max_extent_size) { u32 num_extents; u64 new_size; @@ -2047,10 +2258,10 @@ void btrfs_split_delalloc_extent(struct inode *inode, * applies here, just in reverse. */ new_size = orig->end - split + 1; - num_extents = count_max_extents(new_size); + num_extents = count_max_extents(fs_info, new_size); new_size = split - orig->start; - num_extents += count_max_extents(new_size); - if (count_max_extents(size) >= num_extents) + num_extents += count_max_extents(fs_info, new_size); + if (count_max_extents(fs_info, size) >= num_extents) return; } @@ -2067,6 +2278,7 @@ void btrfs_split_delalloc_extent(struct inode *inode, void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, struct extent_state *other) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); u64 new_size, old_size; u32 num_extents; @@ -2080,7 +2292,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, new_size = other->end - new->start + 1; /* we're not bigger than the max, unreserve the space and go */ - if (new_size <= BTRFS_MAX_EXTENT_SIZE) { + if (new_size <= fs_info->max_extent_size) { spin_lock(&BTRFS_I(inode)->lock); btrfs_mod_outstanding_extents(BTRFS_I(inode), -1); spin_unlock(&BTRFS_I(inode)->lock); @@ -2106,10 +2318,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new, * this case. */ old_size = other->end - other->start + 1; - num_extents = count_max_extents(old_size); + num_extents = count_max_extents(fs_info, old_size); old_size = new->end - new->start + 1; - num_extents += count_max_extents(old_size); - if (count_max_extents(new_size) >= num_extents) + num_extents += count_max_extents(fs_info, old_size); + if (count_max_extents(fs_info, new_size) >= num_extents) return; spin_lock(&BTRFS_I(inode)->lock); @@ -2174,21 +2386,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root, * list of inodes that have pending delalloc work to be done. 
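The delalloc split/merge hooks above now size reservations against the runtime fs_info->max_extent_size rather than the fixed BTRFS_MAX_EXTENT_SIZE, which is why count_max_extents() gained an fs_info argument. The arithmetic behind it is, in effect, a round-up division; a standalone model, including the split check the code performs in reverse:

#include <stdbool.h>
#include <stdint.h>

/* Model of count_max_extents(): max-sized extents needed to cover `len` bytes. */
static uint32_t count_max_extents_model(uint64_t max_extent_size, uint64_t len)
{
        return (len + max_extent_size - 1) / max_extent_size;
}

/* Does splitting [start, end] at `split` require one more outstanding extent? */
static bool split_needs_extra_extent(uint64_t max_extent_size, uint64_t start,
                                     uint64_t end, uint64_t split)
{
        uint32_t before = count_max_extents_model(max_extent_size, end - start + 1);
        uint32_t after = count_max_extents_model(max_extent_size, split - start) +
                         count_max_extents_model(max_extent_size, end - split + 1);

        return after > before;
}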
*/ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, - unsigned *bits) + u32 bits) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC)) WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode)); spin_lock(&BTRFS_I(inode)->lock); @@ -2203,7 +2415,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; - if (*bits & EXTENT_DEFRAG) + if (bits & EXTENT_DEFRAG) BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) @@ -2212,7 +2424,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, } if (!(state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 - state->start; @@ -2225,14 +2437,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state, * accounting happens. */ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, - struct extent_state *state, unsigned *bits) + struct extent_state *state, u32 bits) { struct btrfs_inode *inode = BTRFS_I(vfs_inode); struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb); u64 len = state->end + 1 - state->start; - u32 num_extents = count_max_extents(len); + u32 num_extents = count_max_extents(fs_info, len); - if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) { + if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) { spin_lock(&inode->lock); inode->defrag_bytes -= len; spin_unlock(&inode->lock); @@ -2243,7 +2455,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * but in this case, we are only testing for the DELALLOC * bit, which is only set or cleared with irqs on */ - if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { + if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = inode->root; bool do_list = !btrfs_is_free_space_inode(inode); @@ -2256,7 +2468,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, * don't need to call delalloc_release_metadata if there is an * error. 
*/ - if (*bits & EXTENT_CLEAR_META_RESV && + if (bits & EXTENT_CLEAR_META_RESV && root != fs_info->tree_root) btrfs_delalloc_release_metadata(inode, len, false); @@ -2266,7 +2478,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, if (!btrfs_is_data_reloc_root(root) && do_list && !(state->state & EXTENT_NORESERVE) && - (*bits & EXTENT_CLEAR_DATA_RESV)) + (bits & EXTENT_CLEAR_DATA_RESV)) btrfs_free_reserved_data_space_noquota(fs_info, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, @@ -2281,11 +2493,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode, } if ((state->state & EXTENT_DELALLOC_NEW) && - (*bits & EXTENT_DELALLOC_NEW)) { + (bits & EXTENT_DELALLOC_NEW)) { spin_lock(&inode->lock); ASSERT(inode->new_delalloc_bytes >= len); inode->new_delalloc_bytes -= len; - if (*bits & EXTENT_ADD_INODE_BYTES) + if (bits & EXTENT_ADD_INODE_BYTES) inode_add_bytes(&inode->vfs_inode, len); spin_unlock(&inode->lock); } @@ -2480,100 +2692,78 @@ out: return errno_to_blk_status(ret); } -/* - * extent_io.c submission hook. This does the right thing for csum calculation - * on write, or reading the csums from the tree before a read. - * - * Rules about async/sync submit, - * a) read: sync submit - * - * b) write without checksum: sync submit - * - * c) write with checksum: - * c-1) if bio is issued by fsync: sync submit - * (sync_writers != 0) - * - * c-2) if root is reloc root: sync submit - * (only in case of buffered IO) - * - * c-3) otherwise: async submit - */ -blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, - int mirror_num, unsigned long bio_flags) - +void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_root *root = BTRFS_I(inode)->root; - enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA; - blk_status_t ret = 0; - int skip_sum; - int async = !atomic_read(&BTRFS_I(inode)->sync_writers); - - skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) || - test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state); - - if (btrfs_is_free_space_inode(BTRFS_I(inode))) - metadata = BTRFS_WQ_ENDIO_FREE_SPACE; + struct btrfs_inode *bi = BTRFS_I(inode); + blk_status_t ret; if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - struct page *page = bio_first_bvec_all(bio)->bv_page; - loff_t file_offset = page_offset(page); - - ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); + ret = extract_ordered_extent(bi, bio, + page_offset(bio_first_bvec_all(bio)->bv_page)); if (ret) goto out; } - if (btrfs_op(bio) != BTRFS_MAP_WRITE) { - ret = btrfs_bio_wq_end_io(fs_info, bio, metadata); - if (ret) - goto out; + /* + * If we need to checksum, and the I/O is not issued by fsync and + * friends, that is ->sync_writers != 0, defer the submission to a + * workqueue to parallelize it. + * + * Csum items for reloc roots have already been cloned at this point, + * so they are handled as part of the no-checksum case. + */ + if (!(bi->flags & BTRFS_INODE_NODATASUM) && + !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) && + !btrfs_is_data_reloc_root(bi->root)) { + if (!atomic_read(&bi->sync_writers) && + btrfs_wq_submit_bio(inode, bio, mirror_num, 0, + btrfs_submit_bio_start)) + return; - if (bio_flags & EXTENT_BIO_COMPRESSED) { - /* - * btrfs_submit_compressed_read will handle completing - * the bio if there were any errors, so just return - * here. 
- */ - ret = btrfs_submit_compressed_read(inode, bio, - mirror_num, - bio_flags); - goto out_no_endio; - } else { - /* - * Lookup bio sums does extra checks around whether we - * need to csum or not, which is why we ignore skip_sum - * here. - */ - ret = btrfs_lookup_bio_sums(inode, bio, NULL); - if (ret) - goto out; - } - goto mapit; - } else if (async && !skip_sum) { - /* csum items have already been cloned */ - if (btrfs_is_data_reloc_root(root)) - goto mapit; - /* we're doing a write, do the async checksumming */ - ret = btrfs_wq_submit_bio(inode, bio, mirror_num, bio_flags, - 0, btrfs_submit_bio_start); - goto out; - } else if (!skip_sum) { - ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false); + ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false); if (ret) goto out; } + btrfs_submit_bio(fs_info, bio, mirror_num); + return; +out: + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + } +} + +void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio, + int mirror_num, enum btrfs_compression_type compress_type) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + blk_status_t ret; + + if (compress_type != BTRFS_COMPRESS_NONE) { + /* + * btrfs_submit_compressed_read will handle completing the bio + * if there were any errors, so just return here. + */ + btrfs_submit_compressed_read(inode, bio, mirror_num); + return; + } -mapit: - ret = btrfs_map_bio(fs_info, bio, mirror_num); + /* Save the original iter for read repair */ + btrfs_bio(bio)->iter = bio->bi_iter; -out: + /* + * Lookup bio sums does extra checks around whether we need to csum or + * not, which is why we ignore skip_sum here. + */ + ret = btrfs_lookup_bio_sums(inode, bio, NULL); if (ret) { bio->bi_status = ret; bio_endio(bio); + return; } -out_no_endio: - return ret; + + btrfs_submit_bio(fs_info, bio, mirror_num); } /* @@ -2980,8 +3170,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi, oe->disk_num_bytes); btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset); - if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) - num_bytes = ram_bytes = oe->truncated_len; + if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) { + num_bytes = oe->truncated_len; + ram_bytes = num_bytes; + } btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes); btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes); btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); @@ -3007,7 +3199,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, * an ordered extent if the range of bytes in the file it covers are * fully written. 
*/ -static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) { struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode); struct btrfs_root *root = inode->root; @@ -3100,6 +3292,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); @@ -3214,71 +3408,75 @@ out: return ret; } -static void finish_ordered_fn(struct btrfs_work *work) -{ - struct btrfs_ordered_extent *ordered_extent; - ordered_extent = container_of(work, struct btrfs_ordered_extent, work); - btrfs_finish_ordered_io(ordered_extent); -} - void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode, struct page *page, u64 start, u64 end, bool uptodate) { trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate); - btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, - finish_ordered_fn, uptodate); + btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate); +} + +/* + * Verify the checksum for a single sector without any extra action that depend + * on the type of I/O. + */ +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, + u32 pgoff, u8 *csum, const u8 * const csum_expected) +{ + SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); + char *kaddr; + + ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); + + shash->tfm = fs_info->csum_shash; + + kaddr = kmap_local_page(page) + pgoff; + crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); + kunmap_local(kaddr); + + if (memcmp(csum, csum_expected, fs_info->csum_size)) + return -EIO; + return 0; } /* * check_data_csum - verify checksum of one sector of uncompressed data * @inode: inode - * @io_bio: btrfs_io_bio which contains the csum + * @bbio: btrfs_bio which contains the csum * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page - * @start: logical offset in the file * * The length of such check is always one sector size. + * + * When csum mismatch is detected, we will also report the error and fill the + * corrupted range with zero. 
(Thus it needs the extra parameters) */ -static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio, - u32 bio_offset, struct page *page, u32 pgoff, - u64 start) +int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio, + u32 bio_offset, struct page *page, u32 pgoff) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; u32 len = fs_info->sectorsize; - const u32 csum_size = fs_info->csum_size; - unsigned int offset_sectors; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; ASSERT(pgoff + len <= PAGE_SIZE); - offset_sectors = bio_offset >> fs_info->sectorsize_bits; - csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size; - - kaddr = kmap_atomic(page); - shash->tfm = fs_info->csum_shash; - - crypto_shash_digest(shash, kaddr + pgoff, len, csum); + csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset); - if (memcmp(csum, csum_expected, csum_size)) + if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected)) goto zeroit; - - kunmap_atomic(kaddr); return 0; + zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, - bbio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), + bbio->file_offset + bio_offset, + csum, csum_expected, bbio->mirror_num); if (bbio->device) btrfs_dev_stat_inc_and_print(bbio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); - memset(kaddr + pgoff, 1, len); - flush_dcache_page(page); - kunmap_atomic(kaddr); + memzero_page(page, pgoff, len); return -EIO; } @@ -3306,11 +3504,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, u32 pg_off; unsigned int result = 0; - if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) { - btrfs_page_clear_checked(fs_info, page, start, end + 1 - start); - return 0; - } - /* * This only happens for NODATASUM or compressed read. * Normally this should be covered by above check for compressed read @@ -3343,8 +3536,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio, EXTENT_NODATASUM); continue; } - ret = check_data_csum(inode, bbio, bio_offset, page, pg_off, - page_offset(page) + pg_off); + ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off); if (ret < 0) { const int nr_bit = (pg_off - offset_in_page(start)) >> root->fs_info->sectorsize_bits; @@ -4133,7 +4325,7 @@ skip_backref: /* * If we are in a rename context, we don't need to update anything in the * log. That will be done later during the rename by btrfs_log_new_name(). - * Besides that, doing it here would only cause extra unncessary btree + * Besides that, doing it here would only cause extra unnecessary btree * operations on the log tree, increasing latency for applications. 
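In the data checksum hunks above, btrfs_check_data_csum() (and btrfs_submit_dio_bio() further down) now locates the expected checksum with btrfs_csum_ptr() instead of open-coding the offset arithmetic. The helper itself is not part of this diff; going by the removed lines, which indexed one csum_size entry per sector of data, it presumably amounts to:

static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info,
                                 u8 *csums, u64 offset)
{
        /* One checksum of csum_size bytes per sectorsize block of data. */
        return csums + (offset >> fs_info->sectorsize_bits) * fs_info->csum_size;
}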
*/ if (!rename_ctx) { @@ -4161,8 +4353,9 @@ err: btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2); inode_inc_iversion(&inode->vfs_inode); inode_inc_iversion(&dir->vfs_inode); - inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime = - dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode); + dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime; + dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime; ret = btrfs_update_inode(trans, root, dir); out: return ret; @@ -4199,8 +4392,9 @@ static struct btrfs_trans_handle *__unlink_start_trans(struct inode *dir) * 1 for the dir index * 1 for the inode ref * 1 for the inode + * 1 for the parent inode */ - return btrfs_start_transaction_fallback_global_rsv(root, 5); + return btrfs_start_transaction_fallback_global_rsv(root, 6); } static int btrfs_unlink(struct inode *dir, struct dentry *dentry) @@ -4323,7 +4517,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2); inode_inc_iversion(dir); - dir->i_mtime = dir->i_ctime = current_time(dir); + dir->i_mtime = current_time(dir); + dir->i_ctime = dir->i_mtime; ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir)); if (ret) btrfs_abort_transaction(trans, ret); @@ -4693,7 +4888,7 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, goto out; } } - ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize); + ret = btrfs_delalloc_reserve_metadata(inode, blocksize, blocksize, false); if (ret < 0) { if (!only_release_metadata) btrfs_free_reserved_data_space(inode, data_reserved, @@ -4714,7 +4909,7 @@ again: goto out_unlock; if (!PageUptodate(page)) { - ret = btrfs_readpage(NULL, page); + ret = btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping) { unlock_page(page); @@ -4762,7 +4957,6 @@ again: else memzero_page(page, (block_start - page_offset(page)) + offset, len); - flush_dcache_page(page); } btrfs_page_clear_checked(fs_info, page, block_start, block_end + 1 - block_start); @@ -4965,9 +5159,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) */ if (newsize != oldsize) { inode_inc_iversion(inode); - if (!(mask & (ATTR_CTIME | ATTR_MTIME))) - inode->i_ctime = inode->i_mtime = - current_time(inode); + if (!(mask & (ATTR_CTIME | ATTR_MTIME))) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } } if (newsize > oldsize) { @@ -5275,7 +5470,7 @@ void btrfs_evict_inode(struct inode *inode) if (!rsv) goto no_delete; rsv->size = btrfs_calc_metadata_size(fs_info, 1); - rsv->failfast = 1; + rsv->failfast = true; btrfs_i_size_write(BTRFS_I(inode), 0); @@ -5667,14 +5862,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) if (ret != -ENOENT) inode = ERR_PTR(ret); else - inode = new_simple_dir(dir->i_sb, &location, sub_root); + inode = new_simple_dir(dir->i_sb, &location, root); } else { inode = btrfs_iget(dir->i_sb, location.objectid, sub_root); - } - if (root != sub_root) btrfs_put_root(sub_root); - if (!IS_ERR(inode) && root != sub_root) { + if (IS_ERR(inode)) + return inode; + down_read(&fs_info->cleanup_work_sem); if (!sb_rdonly(inode->i_sb)) ret = btrfs_orphan_cleanup(sub_root); @@ -5780,8 +5975,6 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct list_head ins_list; struct list_head del_list; int ret; - struct extent_buffer *leaf; - int slot; char *name_ptr; int name_len; int entries = 0; 
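The readdir hunk that follows replaces the open-coded btrfs_search_slot() plus btrfs_next_leaf() loop with the btrfs_for_each_slot() iterator. A minimal sketch of that pattern, modelled purely on the usage shown below; the counting helper here is illustrative and not part of the patch:

static int count_dir_index_items(struct btrfs_root *root, u64 ino)
{
        struct btrfs_key key = { .objectid = ino, .type = BTRFS_DIR_INDEX_KEY };
        struct btrfs_key found_key;
        struct btrfs_path *path;
        int count = 0;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* Iterates items starting at 'key', advancing to the next leaf as needed. */
        btrfs_for_each_slot(root, &key, &found_key, path, ret) {
                if (found_key.objectid != ino ||
                    found_key.type != BTRFS_DIR_INDEX_KEY)
                        break;
                count++;
        }
        /* As in the hunk below: ret < 0 reports an error from the iteration. */
        if (ret < 0)
                count = ret;

        btrfs_free_path(path);
        return count;
}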
@@ -5808,35 +6001,19 @@ again: key.offset = ctx->pos; key.objectid = btrfs_ino(BTRFS_I(inode)); - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, ret) { struct dir_entry *entry; - - leaf = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); + struct extent_buffer *leaf = path->nodes[0]; if (found_key.objectid != key.objectid) break; if (found_key.type != BTRFS_DIR_INDEX_KEY) break; if (found_key.offset < ctx->pos) - goto next; + continue; if (btrfs_should_delete_dir_index(&del_list, found_key.offset)) - goto next; - di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + continue; + di = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); name_len = btrfs_dir_name_len(leaf, di); if ((total_len + sizeof(struct dir_entry) + name_len) >= PAGE_SIZE) { @@ -5863,9 +6040,11 @@ again: entries++; addr += sizeof(struct dir_entry) + name_len; total_len += sizeof(struct dir_entry) + name_len; -next: - path->slots[0]++; } + /* Catch error encountered during iteration */ + if (ret < 0) + goto err; + btrfs_release_path(path); ret = btrfs_filldir(private->filldir_buf, entries, ctx); @@ -6053,6 +6232,57 @@ static int btrfs_insert_inode_locked(struct inode *inode) btrfs_find_actor, &args); } +int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args, + unsigned int *trans_num_items) +{ + struct inode *dir = args->dir; + struct inode *inode = args->inode; + int ret; + + ret = posix_acl_create(dir, &inode->i_mode, &args->default_acl, &args->acl); + if (ret) + return ret; + + /* 1 to add inode item */ + *trans_num_items = 1; + /* 1 to add compression property */ + if (BTRFS_I(dir)->prop_compress) + (*trans_num_items)++; + /* 1 to add default ACL xattr */ + if (args->default_acl) + (*trans_num_items)++; + /* 1 to add access ACL xattr */ + if (args->acl) + (*trans_num_items)++; +#ifdef CONFIG_SECURITY + /* 1 to add LSM xattr */ + if (dir->i_security) + (*trans_num_items)++; +#endif + if (args->orphan) { + /* 1 to add orphan item */ + (*trans_num_items)++; + } else { + /* + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item + * + * No need for 1 unit for the inode ref item because it is + * inserted in a batch together with the inode item at + * btrfs_create_new_inode(). + */ + *trans_num_items += 3; + } + return 0; +} + +void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args) +{ + posix_acl_release(args->acl); + posix_acl_release(args->default_acl); +} + /* * Inherit flags from the parent inode. 
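btrfs_new_inode_prepare() above replaces the old hard-coded transaction reservations with an explicit per-item count. As a worked example, assume a regular (non-O_TMPFILE) create with an active LSM (dir->i_security set), no compression property on the parent directory, and both a default and an access ACL inherited from it:

        1 (inode item) + 1 (default ACL xattr) + 1 (access ACL xattr)
      + 1 (LSM xattr)  + 3 (dir item, dir index, parent inode update) = 7

btrfs_create_common(), added later in this patch, passes that count straight to btrfs_start_transaction().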
* @@ -6062,9 +6292,6 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) { unsigned int flags; - if (!dir) - return; - flags = BTRFS_I(dir)->flags; if (flags & BTRFS_INODE_NOCOMPRESS) { @@ -6084,76 +6311,86 @@ static void btrfs_inherit_iflags(struct inode *inode, struct inode *dir) btrfs_sync_inode_flags_to_i_flags(inode); } -static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - const char *name, int name_len, - u64 ref_objectid, u64 objectid, - umode_t mode, u64 *index) +int btrfs_create_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_new_inode_args *args) { - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct inode *dir = args->dir; + struct inode *inode = args->inode; + const char *name = args->orphan ? NULL : args->dentry->d_name.name; + int name_len = args->orphan ? 0 : args->dentry->d_name.len; + struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); + struct btrfs_root *root; struct btrfs_inode_item *inode_item; struct btrfs_key *location; struct btrfs_path *path; + u64 objectid; struct btrfs_inode_ref *ref; struct btrfs_key key[2]; u32 sizes[2]; struct btrfs_item_batch batch; unsigned long ptr; - unsigned int nofs_flag; int ret; path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); - - nofs_flag = memalloc_nofs_save(); - inode = new_inode(fs_info->sb); - memalloc_nofs_restore(nofs_flag); - if (!inode) { - btrfs_free_path(path); - return ERR_PTR(-ENOMEM); - } + return -ENOMEM; - /* - * O_TMPFILE, set link count to 0, so that after this point, - * we fill in an inode item with the correct link count. - */ - if (!name) - set_nlink(inode, 0); + if (!args->subvol) + BTRFS_I(inode)->root = btrfs_grab_root(BTRFS_I(dir)->root); + root = BTRFS_I(inode)->root; - /* - * we have to initialize this early, so we can reclaim the inode - * number if we fail afterwards in this function. - */ + ret = btrfs_get_free_objectid(root, &objectid); + if (ret) + goto out; inode->i_ino = objectid; - if (dir && name) { + if (args->orphan) { + /* + * O_TMPFILE, set link count to 0, so that after this point, we + * fill in an inode item with the correct link count. + */ + set_nlink(inode, 0); + } else { trace_btrfs_inode_request(dir); - ret = btrfs_set_inode_index(BTRFS_I(dir), index); - if (ret) { - btrfs_free_path(path); - iput(inode); - return ERR_PTR(ret); - } - } else if (dir) { - *index = 0; + ret = btrfs_set_inode_index(BTRFS_I(dir), &BTRFS_I(inode)->dir_index); + if (ret) + goto out; } - /* - * index_cnt is ignored for everything but a dir, - * btrfs_set_inode_index_count has an explanation for the magic - * number - */ - BTRFS_I(inode)->index_cnt = 2; - BTRFS_I(inode)->dir_index = *index; - BTRFS_I(inode)->root = btrfs_grab_root(root); + /* index_cnt is ignored for everything but a dir. */ + BTRFS_I(inode)->index_cnt = BTRFS_DIR_START_INDEX; BTRFS_I(inode)->generation = trans->transid; inode->i_generation = BTRFS_I(inode)->generation; /* + * Subvolumes don't inherit flags from their parent directory. + * Originally this was probably by accident, but we probably can't + * change it now without compatibility issues. 
+ */ + if (!args->subvol) + btrfs_inherit_iflags(inode, dir); + + if (S_ISREG(inode->i_mode)) { + if (btrfs_test_opt(fs_info, NODATASUM)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; + if (btrfs_test_opt(fs_info, NODATACOW)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; + } + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + location->type = BTRFS_INODE_ITEM_KEY; + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) { + if (!args->orphan) + BTRFS_I(dir)->index_cnt--; + goto out; + } + + /* * We could have gotten an inode number from somebody who was fsynced * and then removed in this same transaction, so let's just set full * sync since it will be a full sync anyway and this will blow away the @@ -6167,7 +6404,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[0] = sizeof(struct btrfs_inode_item); - if (name) { + if (!args->orphan) { /* * Start new inodes with an inode_ref. This is slightly more * efficient for small numbers of hard links since they will @@ -6176,64 +6413,101 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, */ key[1].objectid = objectid; key[1].type = BTRFS_INODE_REF_KEY; - key[1].offset = ref_objectid; - - sizes[1] = name_len + sizeof(*ref); - } - - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - location->type = BTRFS_INODE_ITEM_KEY; - - ret = btrfs_insert_inode_locked(inode); - if (ret < 0) { - iput(inode); - goto fail; + if (args->subvol) { + key[1].offset = objectid; + sizes[1] = 2 + sizeof(*ref); + } else { + key[1].offset = btrfs_ino(BTRFS_I(dir)); + sizes[1] = name_len + sizeof(*ref); + } } batch.keys = &key[0]; batch.data_sizes = &sizes[0]; - batch.total_data_size = sizes[0] + (name ? sizes[1] : 0); - batch.nr = name ? 2 : 1; + batch.total_data_size = sizes[0] + (args->orphan ? 0 : sizes[1]); + batch.nr = args->orphan ? 1 : 2; ret = btrfs_insert_empty_items(trans, root, path, &batch); - if (ret != 0) - goto fail_unlock; - - inode_init_owner(mnt_userns, inode, dir, mode); - inode_set_bytes(inode, 0); + if (ret != 0) { + btrfs_abort_transaction(trans, ret); + goto discard; + } inode->i_mtime = current_time(inode); inode->i_atime = inode->i_mtime; inode->i_ctime = inode->i_mtime; BTRFS_I(inode)->i_otime = inode->i_mtime; + /* + * We're going to fill the inode item now, so at this point the inode + * must be fully initialized. 
+ */ + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_inode_item); memzero_extent_buffer(path->nodes[0], (unsigned long)inode_item, sizeof(*inode_item)); fill_inode_item(trans, path->nodes[0], inode_item, inode); - if (name) { + if (!args->orphan) { ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, struct btrfs_inode_ref); - btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); - btrfs_set_inode_ref_index(path->nodes[0], ref, *index); ptr = (unsigned long)(ref + 1); - write_extent_buffer(path->nodes[0], name, ptr, name_len); + if (args->subvol) { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, 2); + btrfs_set_inode_ref_index(path->nodes[0], ref, 0); + write_extent_buffer(path->nodes[0], "..", ptr, 2); + } else { + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, + BTRFS_I(inode)->dir_index); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + } } btrfs_mark_buffer_dirty(path->nodes[0]); + /* + * We don't need the path anymore, plus inheriting properties, adding + * ACLs, security xattrs, orphan item or adding the link, will result in + * allocating yet another path. So just free our path. + */ btrfs_free_path(path); + path = NULL; - btrfs_inherit_iflags(inode, dir); + if (args->subvol) { + struct inode *parent; - if (S_ISREG(mode)) { - if (btrfs_test_opt(fs_info, NODATASUM)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATASUM; - if (btrfs_test_opt(fs_info, NODATACOW)) - BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + /* + * Subvolumes inherit properties from their parent subvolume, + * not the directory they were created in. + */ + parent = btrfs_iget(fs_info->sb, BTRFS_FIRST_FREE_OBJECTID, + BTRFS_I(dir)->root); + if (IS_ERR(parent)) { + ret = PTR_ERR(parent); + } else { + ret = btrfs_inode_inherit_props(trans, inode, parent); + iput(parent); + } + } else { + ret = btrfs_inode_inherit_props(trans, inode, dir); + } + if (ret) { + btrfs_err(fs_info, + "error inheriting props for ino %llu (root %llu): %d", + btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, + ret); + } + + /* + * Subvolumes don't inherit ACLs or get passed to the LSM. This is + * probably a bug. + */ + if (!args->subvol) { + ret = btrfs_init_inode_security(trans, args); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } } inode_tree_add(inode); @@ -6243,21 +6517,29 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_update_root_times(trans, root); - ret = btrfs_inode_inherit_props(trans, inode, dir); - if (ret) - btrfs_err(fs_info, - "error inheriting props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), root->root_key.objectid, ret); + if (args->orphan) { + ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + } else { + ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, + name_len, 0, BTRFS_I(inode)->dir_index); + } + if (ret) { + btrfs_abort_transaction(trans, ret); + goto discard; + } - return inode; + return 0; -fail_unlock: +discard: + /* + * discard_new_inode() calls iput(), but the caller owns the reference + * to the inode. 
+ */ + ihold(inode); discard_new_inode(inode); -fail: - if (dir && name) - BTRFS_I(dir)->index_cnt--; +out: btrfs_free_path(path); - return ERR_PTR(ret); + return ret; } /* @@ -6349,147 +6631,71 @@ fail_dir_item: return ret; } -static int btrfs_add_nondir(struct btrfs_trans_handle *trans, - struct btrfs_inode *dir, struct dentry *dentry, - struct btrfs_inode *inode, int backref, u64 index) -{ - int err = btrfs_add_link(trans, dir, inode, - dentry->d_name.name, dentry->d_name.len, - backref, index); - if (err > 0) - err = -EEXIST; - return err; -} - -static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) +static int btrfs_create_common(struct inode *dir, struct dentry *dentry, + struct inode *inode) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .inode = inode, + }; + unsigned int trans_num_items; + struct btrfs_trans_handle *trans; int err; - u64 objectid; - u64 index = 0; - - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - err = btrfs_get_free_objectid(root, &objectid); + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - btrfs_update_inode(trans, root, BTRFS_I(inode)); - d_instantiate_new(dentry, inode); + err = btrfs_create_new_inode(trans, &new_inode_args); + if (!err) + d_instantiate_new(dentry, inode); -out_unlock: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } -static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) +static int btrfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, dev_t rdev) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - int err; - u64 objectid; - u64 index = 0; + struct inode *inode; - /* - * 2 for inode item and ref - * 2 for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + return btrfs_create_common(dir, dentry, inode); +} - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_unlock; +static int btrfs_create(struct user_namespace *mnt_userns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool excl) +{ + struct inode *inode; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; - } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. 
- */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_unlock; - - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 0, index); - if (err) - goto out_unlock; - - d_instantiate_new(dentry, inode); - -out_unlock: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6535,8 +6741,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, ihold(inode); set_bit(BTRFS_INODE_COPY_EVERYTHING, &BTRFS_I(inode)->runtime_flags); - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, BTRFS_I(inode), - 1, index); + err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), + dentry->d_name.name, dentry->d_name.len, 1, index); if (err) { drop_inode = 1; @@ -6573,66 +6779,15 @@ fail: static int btrfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, umode_t mode) { - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct inode *inode = NULL; - struct btrfs_trans_handle *trans; - struct btrfs_root *root = BTRFS_I(dir)->root; - int err = 0; - u64 objectid = 0; - u64 index = 0; - - /* - * 2 items for inode and ref - * 2 items for dir items - * 1 for xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - err = btrfs_get_free_objectid(root, &objectid); - if (err) - goto out_fail; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFDIR | mode, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_fail; - } + struct inode *inode; - /* these must be set before we unlock the inode */ + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_fail; - - btrfs_i_size_write(BTRFS_I(inode), 0); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (err) - goto out_fail; - - err = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - dentry->d_name.name, - dentry->d_name.len, 0, index); - if (err) - goto out_fail; - - d_instantiate_new(dentry, inode); - -out_fail: - btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } - btrfs_btree_balance_dirty(fs_info); - return err; + return btrfs_create_common(dir, dentry, inode); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -7141,6 +7296,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, u64 *ram_bytes, bool strict) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct can_nocow_file_extent_args nocow_args = { 0 }; struct btrfs_path *path; int ret; struct extent_buffer *leaf; @@ -7148,13 +7304,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 
offset, u64 *len, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; - u64 disk_bytenr; - u64 backref_offset; - u64 extent_end; - u64 num_bytes; - int slot; int found_type; - bool nocow = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW); path = btrfs_alloc_path(); if (!path) @@ -7165,18 +7315,17 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, if (ret < 0) goto out; - slot = path->slots[0]; if (ret == 1) { - if (slot == 0) { + if (path->slots[0] == 0) { /* can't find the item, must cow */ ret = 0; goto out; } - slot--; + path->slots[0]--; } ret = 0; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); if (key.objectid != btrfs_ino(BTRFS_I(inode)) || key.type != BTRFS_EXTENT_DATA_KEY) { /* not our file or wrong item type, must cow */ @@ -7188,55 +7337,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, goto out; } - fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, fi); - if (found_type != BTRFS_FILE_EXTENT_REG && - found_type != BTRFS_FILE_EXTENT_PREALLOC) { - /* not a regular extent, must cow */ - goto out; - } - - if (!nocow && found_type == BTRFS_FILE_EXTENT_REG) + if (btrfs_file_extent_end(path) <= offset) goto out; - extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); - if (extent_end <= offset) - goto out; + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, fi); + if (ram_bytes) + *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); - disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); - if (disk_bytenr == 0) - goto out; + nocow_args.start = offset; + nocow_args.end = offset + *len - 1; + nocow_args.strict = strict; + nocow_args.free_path = true; - if (btrfs_file_extent_compression(leaf, fi) || - btrfs_file_extent_encryption(leaf, fi) || - btrfs_file_extent_other_encoding(leaf, fi)) - goto out; + ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + /* can_nocow_file_extent() has freed the path. */ + path = NULL; - /* - * Do the same check as in btrfs_cross_ref_exist but without the - * unnecessary search. - */ - if (!strict && - (btrfs_file_extent_generation(leaf, fi) <= - btrfs_root_last_snapshot(&root->root_item))) + if (ret != 1) { + /* Treat errors as not being able to NOCOW. 
*/ + ret = 0; goto out; - - backref_offset = btrfs_file_extent_offset(leaf, fi); - - if (orig_start) { - *orig_start = key.offset - backref_offset; - *orig_block_len = btrfs_file_extent_disk_num_bytes(leaf, fi); - *ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); } - if (btrfs_extent_readonly(fs_info, disk_bytenr)) + ret = 0; + if (btrfs_extent_readonly(fs_info, nocow_args.disk_bytenr)) goto out; - num_bytes = min(offset + *len, extent_end) - offset; - if (!nocow && found_type == BTRFS_FILE_EXTENT_PREALLOC) { + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; - range_end = round_up(offset + num_bytes, + range_end = round_up(offset + nocow_args.num_bytes, root->fs_info->sectorsize) - 1; ret = test_range_bit(io_tree, offset, range_end, EXTENT_DELALLOC, 0, NULL); @@ -7246,36 +7378,12 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } } - btrfs_release_path(path); + if (orig_start) + *orig_start = key.offset - nocow_args.extent_offset; + if (orig_block_len) + *orig_block_len = nocow_args.disk_num_bytes; - /* - * look for other files referencing this extent, if we - * find any we must cow - */ - - ret = btrfs_cross_ref_exist(root, btrfs_ino(BTRFS_I(inode)), - key.offset - backref_offset, disk_bytenr, - strict); - if (ret) { - ret = 0; - goto out; - } - - /* - * adjust disk_bytenr and num_bytes to cover just the bytes - * in this extent we are about to write. If there - * are any csums in that range we have to cow in order - * to keep the csums correct - */ - disk_bytenr += backref_offset; - disk_bytenr += offset - key.offset; - if (csum_exist_in_range(fs_info, disk_bytenr, num_bytes)) - goto out; - /* - * all of the above have passed, it is safe to overwrite this extent - * without cow - */ - *len = num_bytes; + *len = nocow_args.num_bytes; ret = 1; out: btrfs_free_path(path); @@ -7283,14 +7391,22 @@ out: } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, bool writing) + struct extent_state **cached_state, + unsigned int iomap_flags) { + const bool writing = (iomap_flags & IOMAP_WRITE); + const bool nowait = (iomap_flags & IOMAP_NOWAIT); + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct btrfs_ordered_extent *ordered; int ret = 0; while (1) { - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + if (nowait) { + if (!try_lock_extent(io_tree, lockstart, lockend)) + return -EAGAIN; + } else { + lock_extent_bits(io_tree, lockstart, lockend, cached_state); + } /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure there's no ordered @@ -7311,10 +7427,14 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + unlock_extent_cached(io_tree, lockstart, lockend, cached_state); if (ordered) { + if (nowait) { + btrfs_put_ordered_extent(ordered); + ret = -EAGAIN; + break; + } /* * If we are doing a DIO read and the ordered extent we * found is for a buffered write, we can not wait for it @@ -7334,7 +7454,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) btrfs_start_ordered_extent(ordered, 1); else - ret = -ENOTBLK; + ret = nowait ? 
-EAGAIN : -ENOTBLK; btrfs_put_ordered_extent(ordered); } else { /* @@ -7350,7 +7470,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, * ordered extent to complete while holding a lock on * that page. */ - ret = -ENOTBLK; + ret = nowait ? -EAGAIN : -ENOTBLK; } if (ret) @@ -7424,12 +7544,15 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, static int btrfs_get_blocks_direct_write(struct extent_map **map, struct inode *inode, struct btrfs_dio_data *dio_data, - u64 start, u64 len) + u64 start, u64 len, + unsigned int iomap_flags) { + const bool nowait = (iomap_flags & IOMAP_NOWAIT); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em = *map; int type; u64 block_start, orig_start, orig_block_len, ram_bytes; + struct btrfs_block_group *bg; bool can_nocow = false; bool space_reserved = false; u64 prev_len; @@ -7455,9 +7578,11 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, block_start = em->block_start + (start - em->start); if (can_nocow_extent(inode, start, &len, &orig_start, - &orig_block_len, &ram_bytes, false) == 1 && - btrfs_inc_nocow_writers(fs_info, block_start)) - can_nocow = true; + &orig_block_len, &ram_bytes, false) == 1) { + bg = btrfs_inc_nocow_writers(fs_info, block_start); + if (bg) + can_nocow = true; + } } prev_len = len; @@ -7465,12 +7590,15 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, struct extent_map *em2; /* We can NOCOW, so only need to reserve metadata space. */ - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len); + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); + if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) + ret = -EAGAIN; goto out; } space_reserved = true; @@ -7479,25 +7607,40 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, orig_start, block_start, len, orig_block_len, ram_bytes, type); - btrfs_dec_nocow_writers(fs_info, block_start); + btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { free_extent_map(em); - *map = em = em2; + *map = em2; + em = em2; } if (IS_ERR(em2)) { ret = PTR_ERR(em2); goto out; } + + dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. */ free_extent_map(em); *map = NULL; - /* We have to COW, so need to reserve metadata and data space. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), - &dio_data->data_reserved, - start, len); + if (nowait) + return -EAGAIN; + + /* + * If we could not allocate data space before locking the file + * range and we can't do a NOCOW write, then we have to fail. + */ + if (!dio_data->data_space_reserved) + return -ENOSPC; + + /* + * We have to COW and we have already reserved data space before, + * so now we reserve only metadata. 
+ */ + ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), len, len, + false); if (ret < 0) goto out; space_reserved = true; @@ -7510,10 +7653,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, *map = em; len = min(len, em->len - (start - em->start)); if (len < prev_len) - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start + len, prev_len - len, - true); + btrfs_delalloc_release_metadata(BTRFS_I(inode), + prev_len - len, true); } /* @@ -7531,15 +7672,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, out: if (ret && space_reserved) { btrfs_delalloc_release_extents(BTRFS_I(inode), len); - if (can_nocow) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - dio_data->data_reserved, - start, len, true); - extent_changeset_free(dio_data->data_reserved); - dio_data->data_reserved = NULL; - } + btrfs_delalloc_release_metadata(BTRFS_I(inode), len, true); } return ret; } @@ -7548,51 +7681,90 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, loff_t length, unsigned int flags, struct iomap *iomap, struct iomap *srcmap) { + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; - struct btrfs_dio_data *dio_data = NULL; + struct btrfs_dio_data *dio_data = iter->private; u64 lockstart, lockend; const bool write = !!(flags & IOMAP_WRITE); int ret = 0; u64 len = length; + const u64 data_alloc_len = length; bool unlock_extents = false; + /* + * Cap the size of reads to that usually seen in buffered I/O as we need + * to allocate a contiguous array for the checksums. + */ if (!write) - len = min_t(u64, len, fs_info->sectorsize); + len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS); lockstart = start; lockend = start + len - 1; /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so we - * need to flush the dirty pages again to make absolutely sure that any - * outstanding dirty pages are on disk. + * iomap_dio_rw() only does filemap_write_and_wait_range(), which isn't + * enough if we've written compressed pages to this area, so we need to + * flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk - the first flush only starts + * compression on the data, while keeping the pages locked, so by the + * time the second flush returns we know bios for the compressed pages + * were submitted and finished, and the pages no longer under writeback. + * + * If we have a NOWAIT request and we have any pages in the range that + * are locked, likely due to compression still in progress, we don't want + * to block on page locks. We also don't want to block on pages marked as + * dirty or under writeback (same as for the non-compression case). + * iomap_dio_rw() did the same check, but after that and before we got + * here, mmap'ed writes may have happened or buffered reads started + * (readpage() and readahead(), which lock pages), as we haven't locked + * the file range yet. 
*/ if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &BTRFS_I(inode)->runtime_flags)) { - ret = filemap_fdatawrite_range(inode->i_mapping, start, - start + length - 1); - if (ret) - return ret; + if (flags & IOMAP_NOWAIT) { + if (filemap_range_needs_writeback(inode->i_mapping, + lockstart, lockend)) + return -EAGAIN; + } else { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; + } } - dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); - if (!dio_data) - return -ENOMEM; - - iomap->private = dio_data; + memset(dio_data, 0, sizeof(*dio_data)); + /* + * We always try to allocate data space and must do it before locking + * the file range, to avoid deadlocks with concurrent writes to the same + * range if the range has several extents and the writes don't expand the + * current i_size (the inode lock is taken in shared mode). If we fail to + * allocate data space here we continue and later, after locking the + * file range, we fail with ENOSPC only if we figure out we can not do a + * NOCOW write. + */ + if (write && !(flags & IOMAP_NOWAIT)) { + ret = btrfs_check_data_free_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, data_alloc_len); + if (!ret) + dio_data->data_space_reserved = true; + else if (ret && !(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + goto err; + } /* * If this errors out it's because we couldn't invalidate pagecache for - * this range and we need to fallback to buffered. + * this range and we need to fallback to buffered IO, or we are doing a + * NOWAIT read/write and we need to block. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { - ret = -ENOTBLK; + ret = lock_extent_direct(inode, lockstart, lockend, &cached_state, flags); + if (ret < 0) goto err; - } em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, len); if (IS_ERR(em)) { @@ -7617,7 +7789,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); - ret = -ENOTBLK; + /* + * If we are in a NOWAIT context, return -EAGAIN in order to + * fallback to buffered IO. This is not only because we can + * block with buffered IO (no support for NOWAIT semantics at + * the moment) but also to avoid returning short reads to user + * space - this happens if we were able to read some data from + * previous non-compressed extents and then when we fallback to + * buffered IO, at btrfs_file_read_iter() by calling + * filemap_read(), we fail to fault in pages for the read buffer, + * in which case filemap_read() returns a short read (the number + * of bytes previously read is > 0, so it does not return -EFAULT). + */ + ret = (flags & IOMAP_NOWAIT) ? 
-EAGAIN : -ENOTBLK; goto unlock_err; } @@ -7652,12 +7836,30 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) { ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, - start, len); + start, len, flags); if (ret < 0) goto unlock_err; unlock_extents = true; /* Recalc len in case the new em is smaller than requested */ len = min(len, em->len - (start - em->start)); + if (dio_data->data_space_reserved) { + u64 release_offset; + u64 release_len = 0; + + if (dio_data->nocow_done) { + release_offset = start; + release_len = data_alloc_len; + } else if (len < data_alloc_len) { + release_offset = start + len; + release_len = data_alloc_len - len; + } + + if (release_len > 0) + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + release_offset, + release_len); + } } else { /* * We need to unlock only the end area that we aren't using. @@ -7702,7 +7904,12 @@ unlock_err: unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - kfree(dio_data); + if (dio_data->data_space_reserved) { + btrfs_free_reserved_data_space(BTRFS_I(inode), + dio_data->data_reserved, + start, data_alloc_len); + extent_changeset_free(dio_data->data_reserved); + } return ret; } @@ -7710,23 +7917,24 @@ err: static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, ssize_t written, unsigned int flags, struct iomap *iomap) { - int ret = 0; - struct btrfs_dio_data *dio_data = iomap->private; + struct iomap_iter *iter = container_of(iomap, struct iomap_iter, iomap); + struct btrfs_dio_data *dio_data = iter->private; size_t submitted = dio_data->submitted; const bool write = !!(flags & IOMAP_WRITE); + int ret = 0; if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); - goto out; + return 0; } if (submitted < length) { pos += submitted; length -= submitted; if (write) - __endio_write_update_ordered(BTRFS_I(inode), pos, - length, false); + btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, + pos, length, false); else unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); @@ -7735,10 +7943,6 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (write) extent_changeset_free(dio_data->data_reserved); -out: - kfree(dio_data); - iomap->private = NULL; - return ret; } @@ -7751,40 +7955,31 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) if (!refcount_dec_and_test(&dip->refs)) return; - if (btrfs_op(dip->dio_bio) == BTRFS_MAP_WRITE) { - __endio_write_update_ordered(BTRFS_I(dip->inode), - dip->file_offset, - dip->bytes, - !dip->dio_bio->bi_status); + if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) { + btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL, + dip->file_offset, dip->bytes, + !dip->bio.bi_status); } else { unlock_extent(&BTRFS_I(dip->inode)->io_tree, dip->file_offset, dip->file_offset + dip->bytes - 1); } - bio_endio(dip->dio_bio); - kfree(dip); + kfree(dip->csums); + bio_endio(&dip->bio); } -static blk_status_t submit_dio_repair_bio(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +static void submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int mirror_num, + enum btrfs_compression_type compress_type) { struct btrfs_dio_private *dip = bio->bi_private; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - blk_status_t ret; BUG_ON(bio_op(bio) == REQ_OP_WRITE); - ret = btrfs_bio_wq_end_io(fs_info, bio, 
BTRFS_WQ_ENDIO_DATA); - if (ret) - return ret; - refcount_inc(&dip->refs); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) - refcount_dec(&dip->refs); - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); } static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, @@ -7793,56 +7988,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip, { struct inode *inode = dip->inode; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - const u32 sectorsize = fs_info->sectorsize; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - struct bio_vec bvec; - struct bvec_iter iter; - u32 bio_offset = 0; blk_status_t err = BLK_STS_OK; + struct bvec_iter iter; + struct bio_vec bv; + u32 offset; + + btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) { + u64 start = bbio->file_offset + offset; + + if (uptodate && + (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page, + bv.bv_offset))) { + clean_io_failure(fs_info, failure_tree, io_tree, start, + bv.bv_page, btrfs_ino(BTRFS_I(inode)), + bv.bv_offset); + } else { + int ret; - __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) { - unsigned int i, nr_sectors, pgoff; - - nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len); - pgoff = bvec.bv_offset; - for (i = 0; i < nr_sectors; i++) { - u64 start = bbio->file_offset + bio_offset; - - ASSERT(pgoff < PAGE_SIZE); - if (uptodate && - (!csum || !check_data_csum(inode, bbio, - bio_offset, bvec.bv_page, - pgoff, start))) { - clean_io_failure(fs_info, failure_tree, io_tree, - start, bvec.bv_page, - btrfs_ino(BTRFS_I(inode)), - pgoff); - } else { - int ret; - - ret = btrfs_repair_one_sector(inode, &bbio->bio, - bio_offset, bvec.bv_page, pgoff, - start, bbio->mirror_num, - submit_dio_repair_bio); - if (ret) - err = errno_to_blk_status(ret); - } - ASSERT(bio_offset + sectorsize > bio_offset); - bio_offset += sectorsize; - pgoff += sectorsize; + ret = btrfs_repair_one_sector(inode, bbio, offset, + bv.bv_page, bv.bv_offset, + submit_dio_repair_bio); + if (ret) + err = errno_to_blk_status(ret); } } - return err; -} -static void __endio_write_update_ordered(struct btrfs_inode *inode, - const u64 offset, const u64 bytes, - const bool uptodate) -{ - btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, - finish_ordered_fn, uptodate); + return err; } static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode, @@ -7869,7 +8043,7 @@ static void btrfs_end_dio_bio(struct bio *bio) err = btrfs_check_read_dio_bio(dip, bbio, !err); if (err) - dip->dio_bio->bi_status = err; + dip->bio.bi_status = err; btrfs_record_physical_zoned(dip->inode, bbio->file_offset, bio); @@ -7877,97 +8051,55 @@ static void btrfs_end_dio_bio(struct bio *bio) btrfs_dio_private_put(dip); } -static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio, - struct inode *inode, u64 file_offset, int async_submit) +static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, + u64 file_offset, int async_submit) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_dio_private *dip = bio->bi_private; - bool write = btrfs_op(bio) == BTRFS_MAP_WRITE; blk_status_t ret; - /* Check btrfs_submit_bio_hook() for rules about async submit. 
*/ - if (async_submit) - async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers); - - if (!write) { - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) - goto err; - } + /* Save the original iter for read repair */ + if (btrfs_op(bio) == BTRFS_MAP_READ) + btrfs_bio(bio)->iter = bio->bi_iter; if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) goto map; - if (write && async_submit) { - ret = btrfs_wq_submit_bio(inode, bio, 0, 0, file_offset, - btrfs_submit_bio_start_direct_io); - goto err; - } else if (write) { + if (btrfs_op(bio) == BTRFS_MAP_WRITE) { + /* Check btrfs_submit_data_write_bio() for async submit rules */ + if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) && + btrfs_wq_submit_bio(inode, bio, 0, file_offset, + btrfs_submit_bio_start_direct_io)) + return; + /* * If we aren't doing async submit, calculate the csum of the * bio now. */ ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false); - if (ret) - goto err; + if (ret) { + bio->bi_status = ret; + bio_endio(bio); + return; + } } else { - u64 csum_offset; - - csum_offset = file_offset - dip->file_offset; - csum_offset >>= fs_info->sectorsize_bits; - csum_offset *= fs_info->csum_size; - btrfs_bio(bio)->csum = dip->csums + csum_offset; + btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums, + file_offset - dip->file_offset); } map: - ret = btrfs_map_bio(fs_info, bio, 0); -err: - return ret; -} - -/* - * If this succeeds, the btrfs_dio_private is responsible for cleaning up locked - * or ordered extents whether or not we submit any bios. - */ -static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, - struct inode *inode, - loff_t file_offset) -{ - const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); - const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); - size_t dip_size; - struct btrfs_dio_private *dip; - - dip_size = sizeof(*dip); - if (!write && csum) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - size_t nblocks; - - nblocks = dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits; - dip_size += fs_info->csum_size * nblocks; - } - - dip = kzalloc(dip_size, GFP_NOFS); - if (!dip) - return NULL; - - dip->inode = inode; - dip->file_offset = file_offset; - dip->bytes = dio_bio->bi_iter.bi_size; - dip->disk_bytenr = dio_bio->bi_iter.bi_sector << 9; - dip->dio_bio = dio_bio; - refcount_set(&dip->refs, 1); - return dip; + btrfs_submit_bio(fs_info, bio, 0); } static void btrfs_submit_direct(const struct iomap_iter *iter, struct bio *dio_bio, loff_t file_offset) { + struct btrfs_dio_private *dip = + container_of(dio_bio, struct btrfs_dio_private, bio); struct inode *inode = iter->inode; const bool write = (btrfs_op(dio_bio) == BTRFS_MAP_WRITE); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const bool raid56 = (btrfs_data_alloc_profile(fs_info) & BTRFS_BLOCK_GROUP_RAID56_MASK); - struct btrfs_dio_private *dip; struct bio *bio; u64 start_sector; int async_submit = 0; @@ -7978,27 +8110,28 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, int ret; blk_status_t status; struct btrfs_io_geometry geom; - struct btrfs_dio_data *dio_data = iter->iomap.private; + struct btrfs_dio_data *dio_data = iter->private; struct extent_map *em = NULL; - dip = btrfs_create_dio_private(dio_bio, inode, file_offset); - if (!dip) { - if (!write) { - unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, - file_offset + dio_bio->bi_iter.bi_size - 1); - } - dio_bio->bi_status = BLK_STS_RESOURCE; - bio_endio(dio_bio); - return; - } + 
dip->inode = inode; + dip->file_offset = file_offset; + dip->bytes = dio_bio->bi_iter.bi_size; + refcount_set(&dip->refs, 1); + dip->csums = NULL; + + if (!write && !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + unsigned int nr_sectors = + (dio_bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (!write) { /* * Load the csums up front to reduce csum tree searches and * contention when submitting bios. - * - * If we have csums disabled this will do nothing. */ + status = BLK_STS_RESOURCE; + dip->csums = kcalloc(nr_sectors, fs_info->csum_size, GFP_NOFS); + if (!dip) + goto out_err; + status = btrfs_lookup_bio_sums(inode, dio_bio, dip->csums); if (status != BLK_STS_OK) goto out_err; @@ -8067,14 +8200,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, async_submit = 1; } - status = btrfs_submit_dio_bio(bio, inode, file_offset, - async_submit); - if (status) { - bio_put(bio); - if (submit_len > 0) - refcount_dec(&dip->refs); - goto out_err_em; - } + btrfs_submit_dio_bio(bio, inode, file_offset, async_submit); dio_data->submitted += clone_len; clone_offset += clone_len; @@ -8088,19 +8214,29 @@ static void btrfs_submit_direct(const struct iomap_iter *iter, out_err_em: free_extent_map(em); out_err: - dip->dio_bio->bi_status = status; + dio_bio->bi_status = status; btrfs_dio_private_put(dip); } -const struct iomap_ops btrfs_dio_iomap_ops = { +static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, }; -const struct iomap_dio_ops btrfs_dio_ops = { +static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, + .bio_set = &btrfs_dio_bioset, }; +ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before) +{ + struct btrfs_dio_data data; + + return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC, + &data, done_before); +} + static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { @@ -8113,52 +8249,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } -int btrfs_readpage(struct file *file, struct page *page) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - struct btrfs_bio_ctrl bio_ctrl = { 0 }; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = btrfs_do_readpage(page, NULL, &bio_ctrl, 0, NULL); - if (bio_ctrl.bio) { - int ret2; - - ret2 = submit_one_bio(bio_ctrl.bio, 0, bio_ctrl.bio_flags); - if (ret == 0) - ret = ret2; - } - return ret; -} - -static int btrfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - int ret; - - if (current->flags & PF_MEMALLOC) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - return 0; - } - - /* - * If we are under memory pressure we will call this directly from the - * VM, we need to make sure we have the inode referenced for the ordered - * extent. If not just return like we didn't do anything. 
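
The btrfs_dio_private/bio layout relied on above (the dip recovered with container_of(dio_bio, struct btrfs_dio_private, bio), and btrfs_dio_bioset initialized further down with offsetof(struct btrfs_dio_private, bio) as its front pad) is the usual embed-and-recover pattern: the bio handed out by the bio_set is preceded by enough private bytes that the wrapper can be found again by subtracting the member's offset. Below is a minimal userspace sketch of that round trip; the demo_* types and the macro are invented stand-ins, only the offsetof() arithmetic is the point.

#include <stdio.h>
#include <stddef.h>

/*
 * Illustration only: recover a wrapping structure from a pointer to a
 * member embedded inside it, the same trick behind the bioset front pad
 * and the container_of(dio_bio, struct btrfs_dio_private, bio) call above.
 */
#define demo_container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct demo_bio {
	unsigned int bi_size;
};

struct demo_dio_private {
	unsigned long long file_offset;
	unsigned long long bytes;
	struct demo_bio bio;		/* embedded member, kept last */
};

int main(void)
{
	struct demo_dio_private dip = { .file_offset = 8192, .bytes = 4096 };
	struct demo_bio *bio = &dip.bio;
	struct demo_dio_private *back =
		demo_container_of(bio, struct demo_dio_private, bio);

	/* back == &dip, so the wrapper's fields are reachable again */
	printf("file_offset=%llu bytes=%llu\n", back->file_offset, back->bytes);
	return 0;
}

Because the bio_set is handed to the iomap layer through iomap_dio_ops, the wrapper comes into existence together with the bio itself, which is what lets the separate btrfs_create_dio_private() allocation go away in this hunk.
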
- */ - if (!igrab(inode)) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - ret = extent_write_full_page(page, wbc); - btrfs_add_delayed_iput(inode); - return ret; -} - static int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc) { @@ -8171,7 +8261,7 @@ static void btrfs_readahead(struct readahead_control *rac) } /* - * For releasepage() and invalidate_folio() we have a race window where + * For release_folio() and invalidate_folio() we have a race window where * folio_end_writeback() is called but the subpage spinlock is not yet released. * If we continue to release/invalidate the page, we could cause use-after-free * for subpage spinlock. So this function is to spin and wait for subpage @@ -8182,7 +8272,7 @@ static void wait_subpage_spinlock(struct page *page) struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -8203,49 +8293,43 @@ static void wait_subpage_spinlock(struct page *page) spin_unlock_irq(&subpage->lock); } -static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - int ret = try_release_extent_mapping(page, gfp_flags); + int ret = try_release_extent_mapping(&folio->page, gfp_flags); if (ret == 1) { - wait_subpage_spinlock(page); - clear_page_extent_mapped(page); + wait_subpage_spinlock(&folio->page); + clear_page_extent_mapped(&folio->page); } return ret; } -static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) { - if (PageWriteback(page) || PageDirty(page)) - return 0; - return __btrfs_releasepage(page, gfp_flags); + if (folio_test_writeback(folio) || folio_test_dirty(folio)) + return false; + return __btrfs_release_folio(folio, gfp_flags); } #ifdef CONFIG_MIGRATION -static int btrfs_migratepage(struct address_space *mapping, - struct page *newpage, struct page *page, +static int btrfs_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, enum migrate_mode mode) { - int ret; + int ret = filemap_migrate_folio(mapping, dst, src, mode); - ret = migrate_page_move_mapping(mapping, newpage, page, 0); if (ret != MIGRATEPAGE_SUCCESS) return ret; - if (page_has_private(page)) - attach_page_private(newpage, detach_page_private(page)); - - if (PageOrdered(page)) { - ClearPageOrdered(page); - SetPageOrdered(newpage); + if (folio_test_ordered(src)) { + folio_clear_ordered(src); + folio_set_ordered(dst); } - if (mode != MIGRATE_SYNC_NO_COPY) - migrate_page_copy(newpage, page); - else - migrate_page_states(newpage, page); return MIGRATEPAGE_SUCCESS; } +#else +#define btrfs_migrate_folio NULL #endif static void btrfs_invalidate_folio(struct folio *folio, size_t offset, @@ -8289,7 +8373,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * still safe to wait for ordered extent to finish. 
*/ if (!(offset == 0 && length == folio_size(folio))) { - btrfs_releasepage(&folio->page, GFP_NOFS); + btrfs_release_folio(folio, GFP_NOFS); return; } @@ -8413,7 +8497,7 @@ next: ASSERT(!folio_test_ordered(folio)); btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); if (!inode_evicting) - __btrfs_releasepage(&folio->page, GFP_NOFS); + __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); } @@ -8462,7 +8546,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * Reserving delalloc space after obtaining the page lock can lead to * deadlock. For example, if a dirty page is locked by this function * and the call to btrfs_delalloc_reserve_space() ends up triggering - * dirty page write out, then the btrfs_writepage() function could + * dirty page write out, then the btrfs_writepages() function could * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. */ @@ -8553,10 +8637,9 @@ again: else zero_start = PAGE_SIZE; - if (zero_start != PAGE_SIZE) { + if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - flush_dcache_page(page); - } + btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); @@ -8639,7 +8722,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback) if (!rsv) return -ENOMEM; rsv->size = min_size; - rsv->failfast = 1; + rsv->failfast = true; /* * 1 for the truncate slack space @@ -8764,46 +8847,23 @@ out: return ret; } -/* - * create a new subvolume directory/inode (helper for the ioctl). - */ -int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, - struct btrfs_root *new_root, - struct btrfs_root *parent_root, - struct user_namespace *mnt_userns) +struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns, + struct inode *dir) { struct inode *inode; - int err; - u64 index = 0; - u64 ino; - - err = btrfs_get_free_objectid(new_root, &ino); - if (err < 0) - return err; - - inode = btrfs_new_inode(trans, new_root, mnt_userns, NULL, "..", 2, - ino, ino, - S_IFDIR | (~current_umask() & S_IRWXUGO), - &index); - if (IS_ERR(inode)) - return PTR_ERR(inode); - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; - - set_nlink(inode, 1); - btrfs_i_size_write(BTRFS_I(inode), 0); - unlock_new_inode(inode); - err = btrfs_subvol_inherit_props(trans, new_root, parent_root); - if (err) - btrfs_err(new_root->fs_info, - "error inheriting subvolume %llu properties: %d", - new_root->root_key.objectid, err); - - err = btrfs_update_inode(trans, new_root, BTRFS_I(inode)); - - iput(inode); - return err; + inode = new_inode(dir->i_sb); + if (inode) { + /* + * Subvolumes don't inherit the sgid bit or the parent's gid if + * the parent's sgid bit is set. This is probably a bug. + */ + inode_init_owner(mnt_userns, inode, NULL, + S_IFDIR | (~current_umask() & S_IRWXUGO)); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + } + return inode; } struct inode *btrfs_alloc_inode(struct super_block *sb) @@ -8943,7 +9003,7 @@ int btrfs_drop_inode(struct inode *inode) static void init_once(void *foo) { - struct btrfs_inode *ei = (struct btrfs_inode *) foo; + struct btrfs_inode *ei = foo; inode_init_once(&ei->vfs_inode); } @@ -8955,6 +9015,7 @@ void __cold btrfs_destroy_cachep(void) * destroy cache. 
*/ rcu_barrier(); + bioset_exit(&btrfs_dio_bioset); kmem_cache_destroy(btrfs_inode_cachep); kmem_cache_destroy(btrfs_trans_handle_cachep); kmem_cache_destroy(btrfs_path_cachep); @@ -8995,6 +9056,11 @@ int __init btrfs_init_cachep(void) if (!btrfs_free_space_bitmap_cachep) goto fail; + if (bioset_init(&btrfs_dio_bioset, BIO_POOL_SIZE, + offsetof(struct btrfs_dio_private, bio), + BIOSET_NEED_BVECS)) + goto fail; + return 0; fail: btrfs_destroy_cachep(); @@ -9050,6 +9116,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); struct btrfs_trans_handle *trans; + unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; struct btrfs_root *dest = BTRFS_I(new_dir)->root; struct inode *new_inode = new_dentry->d_inode; @@ -9081,14 +9148,37 @@ static int btrfs_rename_exchange(struct inode *old_dir, down_read(&fs_info->subvol_sem); /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume their normal - * inodes. So 5 * 2 is 10, plus 2 for the new links, so 12 total items - * should cover the worst case number of items we'll modify. + * For each inode: + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + * 1 to update parent inode + * + * If the parents are the same, we only need to account for one */ - trans = btrfs_start_transaction(root, 12); + trans_num_items = (old_dir == new_dir ? 9 : 10); + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode item + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } + if (new_ino == BTRFS_FIRST_FREE_OBJECTID) + trans_num_items += 4; + else + trans_num_items += 3; + trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); goto out_notrans; @@ -9153,8 +9243,10 @@ static int btrfs_rename_exchange(struct inode *old_dir, inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); inode_inc_iversion(new_inode); - old_dir->i_ctime = old_dir->i_mtime = ctime; - new_dir->i_ctime = new_dir->i_mtime = ctime; + old_dir->i_mtime = ctime; + old_dir->i_ctime = ctime; + new_dir->i_mtime = ctime; + new_dir->i_ctime = ctime; old_inode->i_ctime = ctime; new_inode->i_ctime = ctime; @@ -9255,56 +9347,19 @@ out_notrans: return ret; } -static int btrfs_whiteout_for_rename(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct user_namespace *mnt_userns, - struct inode *dir, - struct dentry *dentry) +static struct inode *new_whiteout_inode(struct user_namespace *mnt_userns, + struct inode *dir) { - int ret; struct inode *inode; - u64 objectid; - u64 index; - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - return ret; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, - dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), - objectid, - S_IFCHR | WHITEOUT_MODE, - &index); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - return ret; + inode = new_inode(dir->i_sb); + if (inode) { + inode_init_owner(mnt_userns, inode, dir, + S_IFCHR | WHITEOUT_MODE); + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, 
inode->i_mode, WHITEOUT_DEV); } - - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, - WHITEOUT_DEV); - - ret = btrfs_init_inode_security(trans, inode, dir, - &dentry->d_name); - if (ret) - goto out; - - ret = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); -out: - unlock_new_inode(inode); - if (ret) - inode_dec_link_count(inode); - iput(inode); - - return ret; + return inode; } static int btrfs_rename(struct user_namespace *mnt_userns, @@ -9313,6 +9368,10 @@ static int btrfs_rename(struct user_namespace *mnt_userns, unsigned int flags) { struct btrfs_fs_info *fs_info = btrfs_sb(old_dir->i_sb); + struct btrfs_new_inode_args whiteout_args = { + .dir = old_dir, + .dentry = old_dentry, + }; struct btrfs_trans_handle *trans; unsigned int trans_num_items; struct btrfs_root *root = BTRFS_I(old_dir)->root; @@ -9367,23 +9426,56 @@ static int btrfs_rename(struct user_namespace *mnt_userns, if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size) filemap_flush(old_inode->i_mapping); - /* close the racy window with snapshot create/destroy ioctl */ - if (old_ino == BTRFS_FIRST_FREE_OBJECTID) + if (flags & RENAME_WHITEOUT) { + whiteout_args.inode = new_whiteout_inode(mnt_userns, old_dir); + if (!whiteout_args.inode) + return -ENOMEM; + ret = btrfs_new_inode_prepare(&whiteout_args, &trans_num_items); + if (ret) + goto out_whiteout_inode; + } else { + /* 1 to update the old parent inode. */ + trans_num_items = 1; + } + + if (old_ino == BTRFS_FIRST_FREE_OBJECTID) { + /* Close the race window with snapshot create/destroy ioctl */ down_read(&fs_info->subvol_sem); + /* + * 1 to remove old root ref + * 1 to remove old root backref + * 1 to add new root ref + * 1 to add new root backref + */ + trans_num_items += 4; + } else { + /* + * 1 to update inode + * 1 to remove old inode ref + * 1 to add new inode ref + */ + trans_num_items += 3; + } /* - * We want to reserve the absolute worst case amount of items. So if - * both inodes are subvols and we need to unlink them then that would - * require 4 item modifications, but if they are both normal inodes it - * would require 5 item modifications, so we'll assume they are normal - * inodes. So 5 * 2 is 10, plus 1 for the new link, so 11 total items - * should cover the worst case number of items we'll modify. - * If our rename has the whiteout flag, we need more 5 units for the - * new inode (1 inode item, 1 inode ref, 2 dir items and 1 xattr item - * when selinux is enabled). 
- */ - trans_num_items = 11; - if (flags & RENAME_WHITEOUT) + * 1 to remove old dir item + * 1 to remove old dir index + * 1 to add new dir item + * 1 to add new dir index + */ + trans_num_items += 4; + /* 1 to update new parent inode if it's not the same as the old parent */ + if (new_dir != old_dir) + trans_num_items++; + if (new_inode) { + /* + * 1 to update inode + * 1 to remove inode ref + * 1 to remove dir item + * 1 to remove dir index + * 1 to possibly add orphan item + */ trans_num_items += 5; + } trans = btrfs_start_transaction(root, trans_num_items); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -9417,9 +9509,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns, inode_inc_iversion(old_dir); inode_inc_iversion(new_dir); inode_inc_iversion(old_inode); - old_dir->i_ctime = old_dir->i_mtime = - new_dir->i_ctime = new_dir->i_mtime = - old_inode->i_ctime = current_time(old_dir); + old_dir->i_mtime = current_time(old_dir); + old_dir->i_ctime = old_dir->i_mtime; + new_dir->i_mtime = old_dir->i_mtime; + new_dir->i_ctime = old_dir->i_mtime; + old_inode->i_ctime = old_dir->i_mtime; if (old_dentry->d_parent != new_dentry->d_parent) btrfs_record_unlink_dir(trans, BTRFS_I(old_dir), @@ -9479,12 +9573,14 @@ static int btrfs_rename(struct user_namespace *mnt_userns, rename_ctx.index, new_dentry->d_parent); if (flags & RENAME_WHITEOUT) { - ret = btrfs_whiteout_for_rename(trans, root, mnt_userns, - old_dir, old_dentry); - + ret = btrfs_create_new_inode(trans, &whiteout_args); if (ret) { btrfs_abort_transaction(trans, ret); goto out_fail; + } else { + unlock_new_inode(whiteout_args.inode); + iput(whiteout_args.inode); + whiteout_args.inode = NULL; } } out_fail: @@ -9493,7 +9589,11 @@ out_fail: out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - + if (flags & RENAME_WHITEOUT) + btrfs_new_inode_args_destroy(&whiteout_args); +out_whiteout_inode: + if (flags & RENAME_WHITEOUT) + iput(whiteout_args.inode); return ret; } @@ -9501,15 +9601,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { + int ret; + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) - return btrfs_rename_exchange(old_dir, old_dentry, new_dir, - new_dentry); + ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir, + new_dentry); + else + ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, + new_dentry, flags); + + btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info); - return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir, - new_dentry, flags); + return ret; } struct btrfs_delalloc_work { @@ -9712,10 +9818,13 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_path *path; struct btrfs_key key; - struct inode *inode = NULL; + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + }; + unsigned int trans_num_items; int err; - u64 objectid; - u64 index = 0; int name_len; int datasize; unsigned long ptr; @@ -9726,49 +9835,40 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return -ENAMETOOLONG; - /* - * 2 items for inode item and ref - * 2 items for dir items - * 1 item for updating parent inode item - * 1 item for the inline extent item - * 1 item for 
xattr if selinux is on - */ - trans = btrfs_start_transaction(root, 7); - if (IS_ERR(trans)) - return PTR_ERR(trans); + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, S_IFLNK | S_IRWXUGO); + inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &btrfs_aops; + btrfs_i_size_write(BTRFS_I(inode), name_len); + inode_set_bytes(inode, name_len); - err = btrfs_get_free_objectid(root, &objectid); + new_inode_args.inode = inode; + err = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (err) - goto out_unlock; + goto out_inode; + /* 1 additional item for the inline extent */ + trans_num_items++; - inode = btrfs_new_inode(trans, root, mnt_userns, dir, - dentry->d_name.name, dentry->d_name.len, - btrfs_ino(BTRFS_I(dir)), objectid, - S_IFLNK | S_IRWXUGO, &index); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - inode = NULL; - goto out_unlock; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_new_inode_args; } - /* - * If the active LSM wants to access the inode during - * d_instantiate it needs these. Smack checks to see - * if the filesystem supports xattrs by looking at the - * ops vector. - */ - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + err = btrfs_create_new_inode(trans, &new_inode_args); if (err) - goto out_unlock; + goto out; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - goto out_unlock; + btrfs_abort_transaction(trans, err); + discard_new_inode(inode); + inode = NULL; + goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); key.offset = 0; @@ -9777,8 +9877,11 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { + btrfs_abort_transaction(trans, err); btrfs_free_path(path); - goto out_unlock; + discard_new_inode(inode); + inode = NULL; + goto out; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -9796,31 +9899,16 @@ static int btrfs_symlink(struct user_namespace *mnt_userns, struct inode *dir, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode_set_bytes(inode, name_len); - btrfs_i_size_write(BTRFS_I(inode), name_len); - err = btrfs_update_inode(trans, root, BTRFS_I(inode)); - /* - * Last step, add directory indexes for our symlink inode. This is the - * last step to avoid extra cleanup of these indexes if an error happens - * elsewhere above. 
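
A rough outline of the creation sequence now shared by the converted callers (symlink, tmpfile, the rename whiteout and the subvolume ioctl) may help when reading the hunks around here. This is a sketch only, not compilable on its own, with all error unwinding omitted since each caller cleans up slightly differently; every function it calls does appear in this diff, and the instantiation step shown roughly follows the symlink/subvolume variants.

/*
 * Sketch only (error handling omitted): the common shape of inode
 * creation after this series.
 */
static int demo_create(struct user_namespace *mnt_userns, struct inode *dir,
		       struct dentry *dentry, umode_t mode)
{
	struct btrfs_new_inode_args args = { .dir = dir, .dentry = dentry };
	struct btrfs_trans_handle *trans;
	unsigned int trans_num_items;

	args.inode = new_inode(dir->i_sb);		/* plain VFS allocation */
	inode_init_owner(mnt_userns, args.inode, dir, mode);

	/* Sizes the transaction; the item count comes back in trans_num_items. */
	btrfs_new_inode_prepare(&args, &trans_num_items);

	trans = btrfs_start_transaction(BTRFS_I(dir)->root, trans_num_items);

	/* Creates the btrfs side: inode item, links, security, properties. */
	btrfs_create_new_inode(trans, &args);

	d_instantiate_new(dentry, args.inode);
	args.inode = NULL;				/* now owned by the dcache */

	btrfs_end_transaction(trans);
	btrfs_new_inode_args_destroy(&args);
	iput(args.inode);				/* iput(NULL) is a no-op */
	return 0;
}
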
- */ - if (!err) - err = btrfs_add_nondir(trans, BTRFS_I(dir), dentry, - BTRFS_I(inode), 0, index); - if (err) - goto out_unlock; - d_instantiate_new(dentry, inode); - -out_unlock: + err = 0; +out: btrfs_end_transaction(trans); - if (err && inode) { - inode_dec_link_count(inode); - discard_new_inode(inode); - } btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (err) + iput(inode); return err; } @@ -9869,6 +9957,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; extent_info.is_new_extent = true; + extent_info.update_times = true; extent_info.qgroup_reserved = qgroup_released; extent_info.insertions = 0; @@ -10071,62 +10160,58 @@ static int btrfs_tmpfile(struct user_namespace *mnt_userns, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(dir)->root; - struct inode *inode = NULL; - u64 objectid; - u64 index; - int ret = 0; - - /* - * 5 units required for adding orphan entry - */ - trans = btrfs_start_transaction(root, 5); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_get_free_objectid(root, &objectid); - if (ret) - goto out; - - inode = btrfs_new_inode(trans, root, mnt_userns, dir, NULL, 0, - btrfs_ino(BTRFS_I(dir)), objectid, mode, &index); - if (IS_ERR(inode)) { - ret = PTR_ERR(inode); - inode = NULL; - goto out; - } + struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .orphan = true, + }; + unsigned int trans_num_items; + int ret; + inode = new_inode(dir->i_sb); + if (!inode) + return -ENOMEM; + inode_init_owner(mnt_userns, inode, dir, mode); inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; - inode->i_mapping->a_ops = &btrfs_aops; - ret = btrfs_init_inode_security(trans, inode, dir, NULL); + new_inode_args.inode = inode; + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); if (ret) - goto out; + goto out_inode; - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - if (ret) - goto out; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); - if (ret) - goto out; + trans = btrfs_start_transaction(root, trans_num_items); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out_new_inode_args; + } + + ret = btrfs_create_new_inode(trans, &new_inode_args); /* - * We set number of links to 0 in btrfs_new_inode(), and here we set - * it to 1 because d_tmpfile() will issue a warning if the count is 0, - * through: + * We set number of links to 0 in btrfs_create_new_inode(), and here we + * set it to 1 because d_tmpfile() will issue a warning if the count is + * 0, through: * * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - mark_inode_dirty(inode); -out: + + if (!ret) { + d_tmpfile(dentry, inode); + unlock_new_inode(inode); + mark_inode_dirty(inode); + } + btrfs_end_transaction(trans); - if (ret && inode) - discard_new_inode(inode); btrfs_btree_balance_dirty(fs_info); +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + if (ret) + iput(inode); return ret; } @@ -10150,9 +10235,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) } } -static int btrfs_encoded_io_compression_from_extent( - struct btrfs_fs_info *fs_info, - int compress_type) +int 
btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info, + int compress_type) { switch (compress_type) { case BTRFS_COMPRESS_NONE: @@ -10275,7 +10359,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num) { struct btrfs_encoded_read_private *priv = bio->bi_private; - struct btrfs_bio *bbio = btrfs_bio(bio); struct btrfs_fs_info *fs_info = inode->root->fs_info; blk_status_t ret; @@ -10285,19 +10368,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode, return ret; } - ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA); - if (ret) { - btrfs_bio_free_csum(bbio); - return ret; - } - atomic_inc(&priv->pending); - ret = btrfs_map_bio(fs_info, bio, mirror_num); - if (ret) { - atomic_dec(&priv->pending); - btrfs_bio_free_csum(bbio); - } - return ret; + btrfs_submit_bio(fs_info, bio, mirror_num); + return BLK_STS_OK; } static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) @@ -10309,7 +10382,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) u32 sectorsize = fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; - u64 start = priv->file_offset; u32 bio_offset = 0; if (priv->skip_csum || !uptodate) @@ -10322,10 +10394,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio) pgoff = bvec->bv_offset; for (i = 0; i < nr_sectors; i++) { ASSERT(pgoff < PAGE_SIZE); - if (check_data_csum(&inode->vfs_inode, bbio, bio_offset, - bvec->bv_page, pgoff, start)) + if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset, + bvec->bv_page, pgoff)) return BLK_STS_IOERR; - start += sectorsize; bio_offset += sectorsize; pgoff += sectorsize; } @@ -10357,11 +10428,9 @@ static void btrfs_encoded_read_endio(struct bio *bio) bio_put(bio); } -static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, - u64 file_offset, - u64 disk_bytenr, - u64 disk_io_size, - struct page **pages) +int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, + u64 file_offset, u64 disk_bytenr, + u64 disk_io_size, struct page **pages) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_encoded_read_private priv = { @@ -10458,13 +10527,11 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) { - ret = -ENOMEM; - goto out; + ret = btrfs_alloc_page_array(nr_pages, pages); + if (ret) { + ret = -ENOMEM; + goto out; } - } ret = btrfs_encoded_read_regular_fill_pages(inode, start, disk_bytenr, disk_io_size, pages); @@ -10594,7 +10661,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, ret = -ENOBUFS; goto out_em; } - disk_io_size = count = em->block_len; + disk_io_size = em->block_len; + count = em->block_len; encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, @@ -10757,15 +10825,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = -ENOMEM; goto out_pages; } - kaddr = kmap(pages[i]); + kaddr = kmap_local_page(pages[i]); if (copy_from_iter(kaddr, bytes, from) != bytes) { - kunmap(pages[i]); + kunmap_local(kaddr); ret = -EFAULT; goto out_pages; } if (bytes < PAGE_SIZE) memset(kaddr + bytes, 0, PAGE_SIZE - bytes); - kunmap(pages[i]); + kunmap_local(kaddr); } for (;;) { @@ -10800,7 
+10868,8 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = btrfs_qgroup_reserve_data(inode, &data_reserved, start, num_bytes); if (ret) goto out_free_data_space; - ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes); + ret = btrfs_delalloc_reserve_metadata(inode, num_bytes, disk_num_bytes, + false); if (ret) goto out_qgroup_free_data; @@ -11309,6 +11378,41 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, spin_unlock(&inode->lock); } +/** + * Verify that there are no ordered extents for a given file range. + * + * @inode: The target inode. + * @start: Start offset of the file range, should be sector size aligned. + * @end: End offset (inclusive) of the file range, its value +1 should be + * sector size aligned. + * + * This should typically be used for cases where we locked an inode's VFS lock in + * exclusive mode, we have also locked the inode's i_mmap_lock in exclusive mode, + * we have flushed all delalloc in the range, we have waited for all ordered + * extents in the range to complete and finally we have locked the file range in + * the inode's io_tree. + */ +void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = inode->root; + struct btrfs_ordered_extent *ordered; + + if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) + return; + + ordered = btrfs_lookup_first_ordered_range(inode, start, end + 1 - start); + if (ordered) { + btrfs_err(root->fs_info, +"found unexpected ordered extent in file range [%llu, %llu] for inode %llu root %llu (ordered range [%llu, %llu])", + start, end, btrfs_ino(inode), root->root_key.objectid, + ordered->file_offset, + ordered->file_offset + ordered->num_bytes - 1); + btrfs_put_ordered_extent(ordered); + } + + ASSERT(ordered == NULL); +} + static const struct inode_operations btrfs_dir_inode_operations = { .getattr = btrfs_getattr, .lookup = btrfs_lookup, @@ -11357,16 +11461,13 @@ static const struct file_operations btrfs_dir_file_operations = { * For now we're avoiding this by dropping bmap. */ static const struct address_space_operations btrfs_aops = { - .readpage = btrfs_readpage, - .writepage = btrfs_writepage, + .read_folio = btrfs_read_folio, .writepages = btrfs_writepages, .readahead = btrfs_readahead, .direct_IO = noop_direct_IO, .invalidate_folio = btrfs_invalidate_folio, - .releasepage = btrfs_releasepage, -#ifdef CONFIG_MIGRATION - .migratepage = btrfs_migratepage, -#endif + .release_folio = btrfs_release_folio, + .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, .error_remove_page = generic_error_remove_page, .swap_activate = btrfs_swap_activate, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b2c692b2fd8d..fe0cc816b4eb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -540,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid) return 1; } +/* + * Calculate the number of transaction items to reserve for creating a subvolume + * or snapshot, not including the inode, directory entries, or parent directory. + */ +static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +{ + /* + * 1 to add root block + * 1 to add root item + * 1 to add root ref + * 1 to add root backref + * 1 to add UUID item + * 1 to add qgroup info + * 1 to add qgroup limit + * + * Ideally the last two would only be accounted if qgroups are enabled, + * but that can change between now and the time we would insert them. 
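
For a concrete feel of the numbers, here is the arithmetic from the comment above as a small stand-alone userspace program. It covers only what create_subvol_num_items() itself accounts for; the new inode, its directory entries and the parent directory update are added separately (via btrfs_new_inode_prepare() on the subvolume path, and as the extra three items visible in the snapshot path below). The function names in the demo are made up.

#include <stdio.h>

/*
 * Illustration only: seven metadata items are always reserved, two more
 * per inherited qgroup, and the snapshot path adds three on top
 * (dir item, dir index, parent inode update).
 */
static unsigned int subvol_items(unsigned int num_inherited_qgroups)
{
	return 7 + 2 * num_inherited_qgroups;
}

static unsigned int snapshot_items(unsigned int num_inherited_qgroups)
{
	return subvol_items(num_inherited_qgroups) + 3;
}

int main(void)
{
	printf("subvol, no inheritance: %u items\n", subvol_items(0));
	printf("snapshot, 2 qgroups:    %u items\n", snapshot_items(2));
	return 0;
}
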
+ */ + unsigned int num_items = 7; + + if (inherit) { + /* 2 to add qgroup relations for each inherited qgroup */ + num_items += 2 * inherit->num_qgroups; + } + return num_items; +} + static noinline int create_subvol(struct user_namespace *mnt_userns, struct inode *dir, struct dentry *dentry, - const char *name, int namelen, struct btrfs_qgroup_inherit *inherit) { struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); @@ -555,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, struct btrfs_root *new_root; struct btrfs_block_rsv block_rsv; struct timespec64 cur_time = current_time(dir); - struct inode *inode; + struct btrfs_new_inode_args new_inode_args = { + .dir = dir, + .dentry = dentry, + .subvol = true, + }; + unsigned int trans_num_items; int ret; - dev_t anon_dev = 0; + dev_t anon_dev; u64 objectid; - u64 index = 0; root_item = kzalloc(sizeof(*root_item), GFP_KERNEL); if (!root_item) @@ -567,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid); if (ret) - goto fail_free; - - ret = get_anon_bdev(&anon_dev); - if (ret < 0) - goto fail_free; + goto out_root_item; /* * Don't create subvolume whose level is not zero. Or qgroup will be @@ -579,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, */ if (btrfs_qgroup_level(objectid)) { ret = -ENOSPC; - goto fail_free; + goto out_root_item; } + ret = get_anon_bdev(&anon_dev); + if (ret < 0) + goto out_root_item; + + new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir); + if (!new_inode_args.inode) { + ret = -ENOMEM; + goto out_anon_dev; + } + ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items); + if (ret) + goto out_inode; + trans_num_items += create_subvol_num_items(inherit); + btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP); - /* - * The same as the snapshot creation, please see the comment - * of create_snapshot(). - */ - ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false); + ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, + trans_num_items, false); if (ret) - goto fail_free; + goto out_new_inode_args; trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_subvolume_release_metadata(root, &block_rsv); - goto fail_free; + goto out_new_inode_args; } trans->block_rsv = &block_rsv; trans->bytes_reserved = block_rsv.size; ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit); if (ret) - goto fail; + goto out; leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); - goto fail; + goto out; } btrfs_mark_buffer_dirty(leaf); @@ -663,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns, btrfs_tree_unlock(leaf); btrfs_free_tree_block(trans, objectid, leaf, 0, 1); free_extent_buffer(leaf); - goto fail; + goto out; } free_extent_buffer(leaf); leaf = NULL; - key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); if (IS_ERR(new_root)) { - free_anon_bdev(anon_dev); ret = PTR_ERR(new_root); btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - /* Freeing will be done in btrfs_put_root() of new_root */ + /* anon_dev is owned by new_root now. */ anon_dev = 0; + BTRFS_I(new_inode_args.inode)->root = new_root; + /* ... and new_root is owned by new_inode_args.inode now. 
*/ ret = btrfs_record_root_in_trans(trans, new_root); if (ret) { - btrfs_put_root(new_root); - btrfs_abort_transaction(trans, ret); - goto fail; - } - - ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns); - btrfs_put_root(new_root); - if (ret) { - /* We potentially lose an unused inode item here */ - btrfs_abort_transaction(trans, ret); - goto fail; - } - - /* - * insert the directory item - */ - ret = btrfs_set_inode_index(BTRFS_I(dir), &index); - if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key, - BTRFS_FT_DIR, index); + ret = btrfs_uuid_tree_add(trans, root_item->uuid, + BTRFS_UUID_KEY_SUBVOL, objectid); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2); - ret = btrfs_update_inode(trans, root, BTRFS_I(dir)); + ret = btrfs_create_new_inode(trans, &new_inode_args); if (ret) { btrfs_abort_transaction(trans, ret); - goto fail; + goto out; } - ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid, - btrfs_ino(BTRFS_I(dir)), index, name, namelen); - if (ret) { - btrfs_abort_transaction(trans, ret); - goto fail; - } + d_instantiate_new(dentry, new_inode_args.inode); + new_inode_args.inode = NULL; - ret = btrfs_uuid_tree_add(trans, root_item->uuid, - BTRFS_UUID_KEY_SUBVOL, objectid); - if (ret) - btrfs_abort_transaction(trans, ret); - -fail: - kfree(root_item); +out: trans->block_rsv = NULL; trans->bytes_reserved = 0; btrfs_subvolume_release_metadata(root, &block_rsv); @@ -740,18 +748,14 @@ fail: btrfs_end_transaction(trans); else ret = btrfs_commit_transaction(trans); - - if (!ret) { - inode = btrfs_lookup_dentry(dir, dentry); - if (IS_ERR(inode)) - return PTR_ERR(inode); - d_instantiate(dentry, inode); - } - return ret; - -fail_free: +out_new_inode_args: + btrfs_new_inode_args_destroy(&new_inode_args); +out_inode: + iput(new_inode_args.inode); +out_anon_dev: if (anon_dev) free_anon_bdev(anon_dev); +out_root_item: kfree(root_item); return ret; } @@ -763,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; + unsigned int trans_num_items; struct btrfs_trans_handle *trans; int ret; @@ -800,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* - * 1 - parent dir inode - * 2 - dir entries - * 1 - root item - * 2 - root ref/backref - * 1 - root of snapshot - * 1 - UUID item + * 1 to add dir item + * 1 to add dir index + * 1 to update parent inode item */ + trans_num_items = create_subvol_num_items(inherit) + 3; ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, - &pending_snapshot->block_rsv, 8, - false); + &pending_snapshot->block_rsv, + trans_num_items, false); if (ret) goto free_pending; @@ -979,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (snap_src) error = create_snapshot(snap_src, dir, dentry, readonly, inherit); else - error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit); + error = create_subvol(mnt_userns, dir, dentry, inherit); if (!error) fsnotify_mkdir(dir, dentry); @@ -1227,16 +1230,18 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, return em; } -static u32 get_extent_max_capacity(const struct extent_map *em) +static u32 
get_extent_max_capacity(const struct btrfs_fs_info *fs_info, + const struct extent_map *em) { if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) return BTRFS_MAX_COMPRESSED; - return BTRFS_MAX_EXTENT_SIZE; + return fs_info->max_extent_size; } static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, u32 extent_thresh, u64 newer_than, bool locked) { + struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *next; bool ret = false; @@ -1260,7 +1265,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, * If the next extent is at its max capacity, defragging current extent * makes no sense, as the total number of extents won't change. */ - if (next->len >= get_extent_max_capacity(em)) + if (next->len >= get_extent_max_capacity(fs_info, em)) goto out; /* Skip older extent */ if (next->generation < newer_than) @@ -1355,7 +1360,7 @@ again: * make it uptodate. */ if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (page->mapping != mapping || !PagePrivate(page)) { unlock_page(page); @@ -1397,6 +1402,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, bool locked, struct list_head *target_list, u64 *last_scanned_ret) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; bool last_is_target = false; u64 cur = start; int ret = 0; @@ -1413,8 +1419,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode, if (!em) break; - /* Skip hole/inline/preallocated extents */ - if (em->block_start >= EXTENT_MAP_LAST_BYTE || + /* + * If the file extent is an inlined one, we may still want to + * defrag it (fallthrough) if it will cause a regular extent. + * This is for users who want to convert inline extents to + * regular ones through max_inline= mount option. + */ + if (em->block_start == EXTENT_MAP_INLINE && + em->len <= inode->root->fs_info->max_inline) + goto next; + + /* Skip hole/delalloc/preallocated extents */ + if (em->block_start == EXTENT_MAP_HOLE || + em->block_start == EXTENT_MAP_DELALLOC || test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) goto next; @@ -1470,9 +1487,18 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * Skip extents already at its max capacity, this is mostly for * compressed extents, which max cap is only 128K. */ - if (em->len >= get_extent_max_capacity(em)) + if (em->len >= get_extent_max_capacity(fs_info, em)) goto next; + /* + * Normally there are no more extents after an inline one, thus + * @next_mergeable will normally be false and not defragged. + * So if an inline extent passed all above checks, just add it + * for defrag, and be converted to regular extents. 
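
The two defrag rules touched here can be summarised as: an inline extent is only worth rewriting if it would come out as a regular extent (its length exceeds the current max_inline setting), and an extent already at its maximum possible size (BTRFS_MAX_COMPRESSED for compressed extents, fs_info->max_extent_size otherwise) is never a target. The stand-alone sketch below models just those two checks; every other condition in defrag_collect_targets() (holes, delalloc, preallocation, generation, the size threshold) is ignored, and the names and constants are invented.

#include <stdbool.h>
#include <stdio.h>

#define DEMO_INLINE	1
#define DEMO_REGULAR	2

/*
 * Illustration only: "max_cap" stands for the extent's maximum possible
 * size, which is smaller for compressed extents.
 */
static bool worth_defragging(int type, unsigned long long len,
			     unsigned long long max_inline,
			     unsigned long long max_cap)
{
	if (type == DEMO_INLINE)
		return len > max_inline;	/* rewrite would yield a regular extent */
	return len < max_cap;			/* already at max capacity: skip */
}

int main(void)
{
	/* max_inline lowered to 0: a 2K inline extent would become regular */
	printf("%d\n", worth_defragging(DEMO_INLINE, 2048, 0, 128ULL << 20));
	/* default max_inline of 2048 keeps the same extent inline: skip */
	printf("%d\n", worth_defragging(DEMO_INLINE, 2048, 2048, 128ULL << 20));
	/* a regular extent already at max capacity is skipped too */
	printf("%d\n", worth_defragging(DEMO_REGULAR, 128ULL << 20, 2048, 128ULL << 20));
	return 0;
}
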
+ */ + if (em->block_start == EXTENT_MAP_INLINE) + goto add; + next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em, extent_thresh, newer_than, locked); if (!next_mergeable) { @@ -2594,7 +2620,7 @@ err: static noinline int btrfs_ioctl_tree_search(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args __user *uargs; + struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; size_t buf_size; @@ -2602,8 +2628,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - uargs = (struct btrfs_ioctl_search_args __user *)argp; - if (copy_from_user(&sk, &uargs->key, sizeof(sk))) return -EFAULT; @@ -2626,7 +2650,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, void __user *argp) { - struct btrfs_ioctl_search_args_v2 __user *uarg; + struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; size_t buf_size; @@ -2636,7 +2660,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, return -EPERM; /* copy search header and buffer size */ - uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp; if (copy_from_user(&args, uarg, sizeof(args))) return -EFAULT; @@ -4223,26 +4246,6 @@ out: return ret; } -static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx) -{ - struct btrfs_data_container *inodes = ctx; - const size_t c = 3 * sizeof(u64); - - if (inodes->bytes_left >= c) { - inodes->bytes_left -= c; - inodes->val[inodes->elem_cnt] = inum; - inodes->val[inodes->elem_cnt + 1] = offset; - inodes->val[inodes->elem_cnt + 2] = root; - inodes->elem_cnt += 3; - } else { - inodes->bytes_missing += c - inodes->bytes_left; - inodes->bytes_left = 0; - inodes->elem_missed += 3; - } - - return 0; -} - static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, void __user *arg, int version) { @@ -4292,7 +4295,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info, } ret = iterate_inodes_from_logical(loi->logical, fs_info, path, - build_ino_list, inodes, ignore_offset); + inodes, ignore_offset); if (ret == -EINVAL) ret = -ENOENT; if (ret < 0) @@ -4335,19 +4338,81 @@ void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, spin_unlock(&fs_info->balance_lock); } +/** + * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as + * required. + * + * @fs_info: the filesystem + * @excl_acquired: ptr to boolean value which is set to false in case balance + * is being resumed + * + * Return 0 on success in which case both fs_info::balance is acquired as well + * as exclusive ops are blocked. In case of failure return an error code. + */ +static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired) +{ + int ret; + + /* + * Exclusive operation is locked. 
Three possibilities: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ + while (1) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { + *excl_acquired = true; + mutex_lock(&fs_info->balance_mutex); + return 0; + } + + mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* This is either (2) or (3) */ + if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (2) */ + ret = -EINPROGRESS; + goto out_failure; + + } else { + mutex_unlock(&fs_info->balance_mutex); + /* + * Lock released to allow other waiters to + * continue, we'll reexamine the status again. + */ + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { + /* This is (3) */ + *excl_acquired = false; + return 0; + } + } + } else { + /* This is (1) */ + ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + goto out_failure; + } + + mutex_unlock(&fs_info->balance_mutex); + } + +out_failure: + mutex_unlock(&fs_info->balance_mutex); + *excl_acquired = false; + return ret; +} + static long btrfs_ioctl_balance(struct file *file, void __user *arg) { struct btrfs_root *root = BTRFS_I(file_inode(file))->root; struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; - bool need_unlock; /* for mut. excl. ops lock */ + bool need_unlock = true; int ret; - if (!arg) - btrfs_warn(fs_info, - "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18"); - if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4355,106 +4420,55 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; -again: - if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { - mutex_lock(&fs_info->balance_mutex); - need_unlock = true; - goto locked; + bargs = memdup_user(arg, sizeof(*bargs)); + if (IS_ERR(bargs)) { + ret = PTR_ERR(bargs); + bargs = NULL; + goto out; } - /* - * mut. excl. ops lock is locked. Three possibilities: - * (1) some other op is running - * (2) balance is running - * (3) balance is paused -- special case (think resume) - */ - mutex_lock(&fs_info->balance_mutex); - if (fs_info->balance_ctl) { - /* this is either (2) or (3) */ - if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - mutex_unlock(&fs_info->balance_mutex); - /* - * Lock released to allow other waiters to continue, - * we'll reexamine the status again. 
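
Reduced to a decision table, the new btrfs_try_lock_balance() distinguishes: exclusive op acquired (a fresh balance may start), no balance_ctl at all (some other exclusive operation owns the slot), balance_ctl present and running (-EINPROGRESS), and balance_ctl present but paused (resume without taking a new exclusive op). The userspace sketch below encodes only that table; the retry loop that drops and retakes balance_mutex around the paused case is deliberately left out, and all names are invented.

#include <stdbool.h>
#include <stdio.h>

enum balance_lock_outcome {
	GOT_EXCLOP,		/* exclusive op acquired, fresh balance may start */
	RESUME_PAUSED,		/* (3) paused balance, resume without a new exclop */
	ERR_BALANCE_RUNNING,	/* (2) balance already running: -EINPROGRESS */
	ERR_OTHER_OP,		/* (1) some other exclusive operation is running */
};

/* Illustration only: the case analysis done under balance_mutex. */
static enum balance_lock_outcome classify(bool exclop_started,
					  bool have_balance_ctl,
					  bool balance_running)
{
	if (exclop_started)
		return GOT_EXCLOP;
	if (!have_balance_ctl)
		return ERR_OTHER_OP;
	if (balance_running)
		return ERR_BALANCE_RUNNING;
	return RESUME_PAUSED;
}

int main(void)
{
	/* paused balance, exclop not taken again: resume it */
	printf("%d\n", classify(false, true, false));
	return 0;
}
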
- */ - mutex_lock(&fs_info->balance_mutex); - - if (fs_info->balance_ctl && - !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) { - /* this is (3) */ - need_unlock = false; - goto locked; - } - - mutex_unlock(&fs_info->balance_mutex); - goto again; - } else { - /* this is (2) */ - mutex_unlock(&fs_info->balance_mutex); - ret = -EINPROGRESS; - goto out; - } - } else { - /* this is (1) */ - mutex_unlock(&fs_info->balance_mutex); - ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; + ret = btrfs_try_lock_balance(fs_info, &need_unlock); + if (ret) goto out; - } -locked: + lockdep_assert_held(&fs_info->balance_mutex); - if (arg) { - bargs = memdup_user(arg, sizeof(*bargs)); - if (IS_ERR(bargs)) { - ret = PTR_ERR(bargs); + if (bargs->flags & BTRFS_BALANCE_RESUME) { + if (!fs_info->balance_ctl) { + ret = -ENOTCONN; goto out_unlock; } - if (bargs->flags & BTRFS_BALANCE_RESUME) { - if (!fs_info->balance_ctl) { - ret = -ENOTCONN; - goto out_bargs; - } + bctl = fs_info->balance_ctl; + spin_lock(&fs_info->balance_lock); + bctl->flags |= BTRFS_BALANCE_RESUME; + spin_unlock(&fs_info->balance_lock); + btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); - bctl = fs_info->balance_ctl; - spin_lock(&fs_info->balance_lock); - bctl->flags |= BTRFS_BALANCE_RESUME; - spin_unlock(&fs_info->balance_lock); - btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE); + goto do_balance; + } - goto do_balance; - } - } else { - bargs = NULL; + if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { + ret = -EINVAL; + goto out_unlock; } if (fs_info->balance_ctl) { ret = -EINPROGRESS; - goto out_bargs; + goto out_unlock; } bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; - goto out_bargs; - } - - if (arg) { - memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); - memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); - memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); - - bctl->flags = bargs->flags; - } else { - /* balance everything - no filters */ - bctl->flags |= BTRFS_BALANCE_TYPE_MASK; + goto out_unlock; } - if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) { - ret = -EINVAL; - goto out_bctl; - } + memcpy(&bctl->data, &bargs->data, sizeof(bctl->data)); + memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta)); + memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys)); + bctl->flags = bargs->flags; do_balance: /* * Ownership of bctl and exclusive operation goes to btrfs_balance. 
@@ -4467,21 +4481,19 @@ do_balance: ret = btrfs_balance(fs_info, bctl, bargs); bctl = NULL; - if ((ret == 0 || ret == -ECANCELED) && arg) { + if (ret == 0 || ret == -ECANCELED) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; } -out_bctl: kfree(bctl); -out_bargs: - kfree(bargs); out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); + kfree(bargs); return ret; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 313d9d685adb..33461b4f9c8b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -45,7 +45,6 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting ne start_ns = ktime_get_ns(); down_read_nested(&eb->lock, nest); - eb->lock_owner = current->pid; trace_btrfs_tree_read_lock(eb, start_ns); } @@ -62,7 +61,6 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) int btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { - eb->lock_owner = current->pid; trace_btrfs_try_tree_read_lock(eb); return 1; } @@ -90,7 +88,6 @@ int btrfs_try_tree_write_lock(struct extent_buffer *eb) void btrfs_tree_read_unlock(struct extent_buffer *eb) { trace_btrfs_tree_read_unlock(eb); - eb->lock_owner = 0; up_read(&eb->lock); } diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 430ad36b8b08..89bc5f825e0a 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -155,7 +155,7 @@ static int copy_compressed_data_to_page(char *compressed_data, out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); write_compress_length(kaddr + offset_in_page(*cur_out), compressed_size); *cur_out += LZO_LEN; @@ -167,7 +167,7 @@ static int copy_compressed_data_to_page(char *compressed_data, u32 copy_len = min_t(u32, sectorsize - *cur_out % sectorsize, orig_out + compressed_size - *cur_out); - kunmap(cur_page); + kunmap_local(kaddr); if ((*cur_out / PAGE_SIZE) >= max_nr_page) return -E2BIG; @@ -180,7 +180,7 @@ static int copy_compressed_data_to_page(char *compressed_data, return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; } - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); memcpy(kaddr + offset_in_page(*cur_out), compressed_data + *cur_out - orig_out, copy_len); @@ -202,7 +202,7 @@ static int copy_compressed_data_to_page(char *compressed_data, *cur_out += sector_bytes_left; out: - kunmap(cur_page); + kunmap_local(kaddr); return 0; } @@ -248,12 +248,12 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap(page_in); + data_in = kmap_local_page(page_in); ret = lzo1x_1_compress(data_in + offset_in_page(cur_in), in_len, workspace->cbuf, &out_len, workspace->mem); - kunmap(page_in); + kunmap_local(data_in); if (ret < 0) { pr_debug("BTRFS: lzo in loop returned %d\n", ret); ret = -EIO; @@ -310,7 +310,6 @@ static void copy_compressed_segment(struct compressed_bio *cb, u32 orig_in = *cur_in; while (*cur_in < orig_in + len) { - char *kaddr; struct page *cur_page; u32 copy_len = min_t(u32, PAGE_SIZE - offset_in_page(*cur_in), orig_in + len - *cur_in); @@ -318,11 +317,8 @@ static void copy_compressed_segment(struct compressed_bio *cb, ASSERT(copy_len); cur_page = cb->compressed_pages[*cur_in / PAGE_SIZE]; - kaddr = kmap(cur_page); - memcpy(dest + *cur_in - orig_in, - kaddr + offset_in_page(*cur_in), - copy_len); - kunmap(cur_page); + 
memcpy_from_page(dest + *cur_in - orig_in, cur_page, + offset_in_page(*cur_in), copy_len); *cur_in += copy_len; } @@ -342,9 +338,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) /* Bytes decompressed so far */ u32 cur_out = 0; - kaddr = kmap(cb->compressed_pages[0]); + kaddr = kmap_local_page(cb->compressed_pages[0]); len_in = read_compress_length(kaddr); - kunmap(cb->compressed_pages[0]); + kunmap_local(kaddr); cur_in += LZO_LEN; /* @@ -378,9 +374,9 @@ int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb) (cur_in + LZO_LEN - 1) / sectorsize); cur_page = cb->compressed_pages[cur_in / PAGE_SIZE]; ASSERT(cur_page); - kaddr = kmap(cur_page); + kaddr = kmap_local_page(cur_page); seg_len = read_compress_length(kaddr + offset_in_page(cur_in)); - kunmap(cur_page); + kunmap_local(kaddr); cur_in += LZO_LEN; if (seg_len > WORKSPACE_CBUF_LENGTH) { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 1957b14b329a..1952ac85222c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -272,25 +272,30 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, spin_unlock_irq(&tree->lock); } +static void finish_ordered_fn(struct btrfs_work *work) +{ + struct btrfs_ordered_extent *ordered_extent; + + ordered_extent = container_of(work, struct btrfs_ordered_extent, work); + btrfs_finish_ordered_io(ordered_extent); +} + /* * Mark all ordered extents io inside the specified range finished. * - * @page: The invovled page for the opeartion. + * @page: The involved page for the operation. * For uncompressed buffered IO, the page status also needs to be * updated to indicate whether the pending ordered io is finished. * Can be NULL for direct IO and compressed write. * For these cases, callers are ensured they won't execute the * endio function twice. - * @finish_func: The function to be executed when all the IO of an ordered - * extent are finished. * * This function is called for endio, thus the range must have ordered - * extent(s) coveri it. + * extent(s) covering it. 
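The ordered-data hunks above drop the finish_func callback: the work item embedded in the ordered extent is enough to get back to the owning structure, so a fixed finish_ordered_fn() built on container_of() can be queued instead. A minimal user-space sketch of that pattern, with simplified stand-in types (work_item, ordered_extent) that are not the kernel's:

#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

/* Stand-ins for btrfs_work / btrfs_ordered_extent. */
struct work_item {
	void (*fn)(struct work_item *w);
};

struct ordered_extent {
	unsigned long long file_offset;
	unsigned long long num_bytes;
	struct work_item work;
};

/* The queued function only ever sees the embedded work item. */
static void finish_ordered_fn(struct work_item *w)
{
	struct ordered_extent *oe = container_of(w, struct ordered_extent, work);

	printf("finishing ordered extent [%llu, %llu)\n",
	       oe->file_offset, oe->file_offset + oe->num_bytes);
}

int main(void)
{
	struct ordered_extent oe = {
		.file_offset = 0,
		.num_bytes = 65536,
		.work = { .fn = finish_ordered_fn },
	};

	/* A worker thread would pop the work item and call its handler. */
	oe.work.fn(&oe.work);
	return 0;
}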
*/ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, - struct page *page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate) + struct page *page, u64 file_offset, + u64 num_bytes, bool uptodate) { struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -401,8 +406,9 @@ void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); cond_wake_up(&entry->wait); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_mark_finished(inode, entry); spin_unlock_irqrestore(&tree->lock, flags); - btrfs_init_work(&entry->work, finish_func, NULL, NULL); + btrfs_init_work(&entry->work, finish_ordered_fn, NULL, NULL); btrfs_queue_work(wq, &entry->work); spin_lock_irqsave(&tree->lock, flags); } @@ -473,6 +479,7 @@ out: if (finished && cached && entry) { *cached = entry; refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_dec_test_pending(inode, entry); } spin_unlock_irqrestore(&tree->lock, flags); return finished; @@ -807,8 +814,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup(inode, entry); + } out: spin_unlock_irqrestore(&tree->lock, flags); return entry; @@ -848,8 +857,10 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( break; } out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_range(inode, entry); + } spin_unlock_irq(&tree->lock); return entry; } @@ -878,6 +889,7 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, ASSERT(list_empty(&ordered->log_list)); list_add_tail(&ordered->log_list, list); refcount_inc(&ordered->refs); + trace_btrfs_ordered_extent_lookup_for_logging(inode, ordered); } spin_unlock_irq(&tree->lock); } @@ -901,6 +913,7 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first(inode, entry); out: spin_unlock_irq(&tree->lock); return entry; @@ -975,8 +988,11 @@ struct btrfs_ordered_extent *btrfs_lookup_first_ordered_range( /* No ordered extent in the range */ entry = NULL; out: - if (entry) + if (entry) { refcount_inc(&entry->refs); + trace_btrfs_ordered_extent_lookup_first_range(inode, entry); + } + spin_unlock_irq(&tree->lock); return entry; } @@ -1055,6 +1071,8 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int ret = 0; + trace_btrfs_ordered_extent_split(BTRFS_I(inode), ordered); + spin_lock_irq(&tree->lock); /* Remove from tree once */ node = &ordered->rb_node; diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index ecad67a2c745..87792f85e2c4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -180,13 +180,14 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) t->last = NULL; } +int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); + void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); void btrfs_mark_ordered_io_finished(struct btrfs_inode *inode, struct page 
*page, u64 file_offset, - u64 num_bytes, btrfs_func_t finish_func, - bool uptodate); + u64 num_bytes, bool uptodate); bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size); diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index 1b31481f9e72..a2ec8ecae8de 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -380,9 +380,8 @@ static struct prop_handler prop_handlers[] = { }, }; -static int inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *parent) +int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *parent) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -457,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans, return 0; } -int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - struct inode *dir) -{ - if (!dir) - return 0; - - return inherit_props(trans, inode, dir); -} - -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root) -{ - struct super_block *sb = root->fs_info->sb; - struct inode *parent_inode, *child_inode; - int ret; - - parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root); - if (IS_ERR(parent_inode)) - return PTR_ERR(parent_inode); - - child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root); - if (IS_ERR(child_inode)) { - iput(parent_inode); - return PTR_ERR(child_inode); - } - - ret = inherit_props(trans, child_inode, parent_inode); - iput(child_inode); - iput(parent_inode); - - return ret; -} - void __init btrfs_props_init(void) { int i; diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 59bea741cfcf..ca9dd3df129b 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -23,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir); -int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_root *parent_root); - #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 1866b1f0da01..db723c0026bd 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2290,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans, return 0; if (!extent_buffer_uptodate(root_eb)) { - ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL); + ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL); if (ret) goto out; } @@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, } int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce) + enum btrfs_qgroup_rsv_type type, bool enforce, + bool noflush) { int ret; ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce); - if (ret <= 0 && ret != -EDQUOT) + if ((ret <= 0 && ret != -EDQUOT) || noflush) return ret; ret = try_flush_qgroup(root); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 880e9df0dac1..0c4dd2a9af96 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, - enum btrfs_qgroup_rsv_type type, bool enforce); + enum btrfs_qgroup_rsv_type type, bool enforce, + bool 
noflush); /* Reserve metadata space for pertrans and prealloc type */ static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root, int num_bytes, bool enforce) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS, enforce); + BTRFS_QGROUP_RSV_META_PERTRANS, + enforce, false); } static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root, - int num_bytes, bool enforce) + int num_bytes, bool enforce, + bool noflush) { return __btrfs_qgroup_reserve_meta(root, num_bytes, - BTRFS_QGROUP_RSV_META_PREALLOC, enforce); + BTRFS_QGROUP_RSV_META_PREALLOC, + enforce, noflush); } void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0e239a4c3b26..2feb5c20641a 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -52,132 +52,21 @@ struct btrfs_stripe_hash_table { struct btrfs_stripe_hash table[]; }; -enum btrfs_rbio_ops { - BTRFS_RBIO_WRITE, - BTRFS_RBIO_READ_REBUILD, - BTRFS_RBIO_PARITY_SCRUB, - BTRFS_RBIO_REBUILD_MISSING, -}; - -struct btrfs_raid_bio { - struct btrfs_io_context *bioc; - - /* while we're doing rmw on a stripe - * we put it into a hash table so we can - * lock the stripe and merge more rbios - * into it. - */ - struct list_head hash_list; - - /* - * LRU list for the stripe cache - */ - struct list_head stripe_cache; - - /* - * for scheduling work in the helper threads - */ - struct btrfs_work work; - - /* - * bio list and bio_list_lock are used - * to add more bios into the stripe - * in hopes of avoiding the full rmw - */ - struct bio_list bio_list; - spinlock_t bio_list_lock; - - /* also protected by the bio_list_lock, the - * plug list is used by the plugging code - * to collect partial bios while plugged. The - * stripe locking code also uses it to hand off - * the stripe lock to the next pending IO - */ - struct list_head plug_list; - - /* - * flags that tell us if it is safe to - * merge with this bio - */ - unsigned long flags; - - /* size of each individual stripe on disk */ - int stripe_len; - - /* number of data stripes (no p/q) */ - int nr_data; - - int real_stripes; - - int stripe_npages; - /* - * set if we're doing a parity rebuild - * for a read from higher up, which is handled - * differently from a parity rebuild as part of - * rmw - */ - enum btrfs_rbio_ops operation; - - /* first bad stripe */ - int faila; - - /* second bad stripe (for raid6 use) */ - int failb; - - int scrubp; - /* - * number of pages needed to represent the full - * stripe - */ - int nr_pages; - - /* - * size of all the bios in the bio_list. This - * helps us decide if the rbio maps to a full - * stripe or not - */ - int bio_list_bytes; - - int generic_bio_cnt; - - refcount_t refs; - - atomic_t stripes_pending; - - atomic_t error; - /* - * these are two arrays of pointers. We allocate the - * rbio big enough to hold them both and setup their - * locations when the rbio is allocated - */ - - /* pointers to pages that we allocated for - * reading/writing stripes directly from the disk (including P/Q) - */ - struct page **stripe_pages; - - /* - * pointers to the pages in the bio_list. 
Stored - * here for faster lookup - */ - struct page **bio_pages; - - /* - * bitmap to record which horizontal stripe has data - */ - unsigned long *dbitmap; - - /* allocated with real_stripes-many pointers for finish_*() calls */ - void **finish_pointers; - - /* allocated with stripe_npages-many bits for finish_*() calls */ - unsigned long *finish_pbitmap; +/* + * A bvec like structure to present a sector inside a page. + * + * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + */ +struct sector_ptr { + struct page *page; + unsigned int pgoff:24; + unsigned int uptodate:8; }; static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct btrfs_work *work); -static void read_rebuild_work(struct btrfs_work *work); +static void rmw_work(struct work_struct *work); +static void read_rebuild_work(struct work_struct *work); static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); static void __free_raid_bio(struct btrfs_raid_bio *rbio); @@ -186,12 +75,12 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); -static void scrub_parity_work(struct btrfs_work *work); +static void scrub_parity_work(struct work_struct *work); -static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func) +static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { - btrfs_init_work(&rbio->work, work_func, NULL, NULL); - btrfs_queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); + INIT_WORK(&rbio->work, work_func); + queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work); } /* @@ -239,7 +128,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) /* * caching an rbio means to copy anything from the - * bio_pages array into the stripe_pages array. We + * bio_sectors array into the stripe_pages array. 
We * use the page uptodate bit in the stripe cache array * to indicate if it has valid data * @@ -255,12 +144,18 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) if (ret) return; - for (i = 0; i < rbio->nr_pages; i++) { - if (!rbio->bio_pages[i]) + for (i = 0; i < rbio->nr_sectors; i++) { + /* Some range not covered by bio (partial write), skip it */ + if (!rbio->bio_sectors[i].page) continue; - copy_highpage(rbio->stripe_pages[i], rbio->bio_pages[i]); - SetPageUptodate(rbio->stripe_pages[i]); + ASSERT(rbio->stripe_sectors[i].page); + memcpy_page(rbio->stripe_sectors[i].page, + rbio->stripe_sectors[i].pgoff, + rbio->bio_sectors[i].page, + rbio->bio_sectors[i].pgoff, + rbio->bioc->fs_info->sectorsize); + rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); } @@ -283,32 +178,86 @@ static int rbio_bucket(struct btrfs_raid_bio *rbio) return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS); } +static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio, + unsigned int page_nr) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + ASSERT(page_nr < rbio->nr_pages); + + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; + i++) { + if (!rbio->stripe_sectors[i].uptodate) + return false; + } + return true; +} + /* - * stealing an rbio means taking all the uptodate pages from the stripe - * array in the source rbio and putting them into the destination rbio + * Update the stripe_sectors[] array to use correct page and pgoff + * + * Should be called every time any page pointer in stripes_pages[] got modified. + */ +static void index_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + u32 offset; + int i; + + for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) { + int page_index = offset >> PAGE_SHIFT; + + ASSERT(page_index < rbio->nr_pages); + rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; + rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + } +} + +static void steal_rbio_page(struct btrfs_raid_bio *src, + struct btrfs_raid_bio *dest, int page_nr) +{ + const u32 sectorsize = src->bioc->fs_info->sectorsize; + const u32 sectors_per_page = PAGE_SIZE / sectorsize; + int i; + + if (dest->stripe_pages[page_nr]) + __free_page(dest->stripe_pages[page_nr]); + dest->stripe_pages[page_nr] = src->stripe_pages[page_nr]; + src->stripe_pages[page_nr] = NULL; + + /* Also update the sector->uptodate bits. */ + for (i = sectors_per_page * page_nr; + i < sectors_per_page * page_nr + sectors_per_page; i++) + dest->stripe_sectors[i].uptodate = true; +} + +/* + * Stealing an rbio means taking all the uptodate pages from the stripe array + * in the source rbio and putting them into the destination rbio. + * + * This will also update the involved stripe_sectors[] which are referring to + * the old pages. 
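index_stripe_sectors() above is the heart of the subpage conversion: every sector is (re)pointed at the stripe page that backs its byte offset inside the full stripe, with the in-page offset kept separately. A stand-alone sketch of that index math; the 16K page / 4K sector sizes and the page_index field are illustrative simplifications, not the kernel's layout:

#include <stdio.h>

#define PAGE_SHIFT	14			/* assume 16K pages */
#define PAGE_SIZE	(1u << PAGE_SHIFT)
#define SECTORSIZE	4096u			/* assume 4K sectors */

struct sector_ptr {
	unsigned int page_index;	/* which stripe page backs this sector */
	unsigned int pgoff;		/* byte offset inside that page */
};

int main(void)
{
	const unsigned int nr_sectors = 16;	/* one 64K stripe worth of 4K sectors */
	struct sector_ptr sectors[16];
	unsigned int i, offset;

	for (i = 0, offset = 0; i < nr_sectors; i++, offset += SECTORSIZE) {
		sectors[i].page_index = offset >> PAGE_SHIFT;
		sectors[i].pgoff = offset & (PAGE_SIZE - 1);
		printf("sector %2u -> page %u, pgoff %u\n",
		       i, sectors[i].page_index, sectors[i].pgoff);
	}
	return 0;
}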
*/ static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; struct page *s; - struct page *d; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { s = src->stripe_pages[i]; - if (!s || !PageUptodate(s)) { + if (!s || !full_page_sectors_uptodate(src, i)) continue; - } - d = dest->stripe_pages[i]; - if (d) - __free_page(d); - - dest->stripe_pages[i] = s; - src->stripe_pages[i] = NULL; + steal_rbio_page(src, dest, i); } + index_stripe_sectors(dest); + index_stripe_sectors(src); } /* @@ -323,6 +272,9 @@ static void merge_rbio(struct btrfs_raid_bio *dest, { bio_list_merge(&dest->bio_list, &victim->bio_list); dest->bio_list_bytes += victim->bio_list_bytes; + /* Also inherit the bitmaps from @victim. */ + bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap, + dest->stripe_nsectors); dest->generic_bio_cnt += victim->generic_bio_cnt; bio_list_init(&victim->bio_list); } @@ -522,9 +474,9 @@ static int rbio_is_full(struct btrfs_raid_bio *rbio) int ret = 1; spin_lock_irqsave(&rbio->bio_list_lock, flags); - if (size != rbio->nr_data * rbio->stripe_len) + if (size != rbio->nr_data * BTRFS_STRIPE_LEN) ret = 0; - BUG_ON(size > rbio->nr_data * rbio->stripe_len); + BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN); spin_unlock_irqrestore(&rbio->bio_list_lock, flags); return ret; @@ -600,39 +552,39 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } -static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, - int index) +static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return stripe * rbio->stripe_npages + index; + ASSERT(stripe_nr < rbio->real_stripes); + ASSERT(sector_nr < rbio->stripe_nsectors); + + return stripe_nr * rbio->stripe_nsectors + sector_nr; } -/* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, - int index) +/* Return a sector from rbio->stripe_sectors, not from the bio list */ +static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int stripe_nr, + unsigned int sector_nr) { - return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; + return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr, + sector_nr)]; } -/* - * helper to index into the pstripe - */ -static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside P stripe */ +static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { - return rbio_stripe_page(rbio, rbio->nr_data, index); + return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr); } -/* - * helper to index into the qstripe, returns null - * if there is no qstripe - */ -static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) +/* Grab a sector inside Q stripe, return NULL if not RAID6 */ +static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio, + unsigned int sector_nr) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - return rbio_stripe_page(rbio, rbio->nr_data + 1, index); + return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr); } /* @@ -864,6 +816,12 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) if (rbio->generic_bio_cnt) btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt); + /* + * 
Clear the data bitmap, as the rbio may be cached for later usage. + * do this before before unlock_stripe() so there will be no new bio + * for this bio. + */ + bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors); /* * At this moment, rbio->bio_list is empty, however since rbio does not @@ -911,47 +869,43 @@ static void raid_write_end_io(struct bio *bio) rbio_orig_end_io(rbio, err); } -/* - * the read/modify/write code wants to use the original bio for - * any pages it included, and then use the rbio for everything - * else. This function decides if a given index (stripe number) - * and page number in that stripe fall inside the original bio - * or the rbio. +/** + * Get a sector pointer specified by its @stripe_nr and @sector_nr * - * if you set bio_list_only, you'll get a NULL back for any ranges - * that are outside the bio_list + * @rbio: The raid bio + * @stripe_nr: Stripe number, valid range [0, real_stripe) + * @sector_nr: Sector number inside the stripe, + * valid range [0, stripe_nsectors) + * @bio_list_only: Whether to use sectors inside the bio list only. * - * This doesn't take any refs on anything, you get a bare page pointer - * and the caller must bump refs as required. - * - * You must call index_rbio_pages once before you can trust - * the answers from this function. + * The read/modify/write code wants to reuse the original bio page as much + * as possible, and only use stripe_sectors as fallback. */ -static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, - int index, int pagenr, int bio_list_only) +static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr, + bool bio_list_only) { - int chunk_page; - struct page *p = NULL; + struct sector_ptr *sector; + int index; + + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); - chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr; + index = stripe_nr * rbio->stripe_nsectors + sector_nr; + ASSERT(index >= 0 && index < rbio->nr_sectors); spin_lock_irq(&rbio->bio_list_lock); - p = rbio->bio_pages[chunk_page]; + sector = &rbio->bio_sectors[index]; + if (sector->page || bio_list_only) { + /* Don't return sector without a valid page pointer */ + if (!sector->page) + sector = NULL; + spin_unlock_irq(&rbio->bio_list_lock); + return sector; + } spin_unlock_irq(&rbio->bio_list_lock); - if (p || bio_list_only) - return p; - - return rbio->stripe_pages[chunk_page]; -} - -/* - * number of pages we need for the entire stripe across all the - * drives - */ -static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) -{ - return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes; + return &rbio->stripe_sectors[index]; } /* @@ -959,23 +913,30 @@ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) * this does not allocate any pages for rbio->pages. 
*/ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, - struct btrfs_io_context *bioc, - u64 stripe_len) -{ + struct btrfs_io_context *bioc) +{ + const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; + const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT; + const unsigned int num_pages = stripe_npages * real_stripes; + const unsigned int stripe_nsectors = + BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; + const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - int nr_data = 0; - int real_stripes = bioc->num_stripes - bioc->num_tgtdevs; - int num_pages = rbio_nr_pages(stripe_len, real_stripes); - int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE); void *p; + /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ + ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); + /* + * Our current stripe len should be fixed to 64k thus stripe_nsectors + * (at most 16) should be no larger than BITS_PER_LONG. + */ + ASSERT(stripe_nsectors <= BITS_PER_LONG); + rbio = kzalloc(sizeof(*rbio) + sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_pages) * num_pages + - sizeof(*rbio->finish_pointers) * real_stripes + - sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) + - sizeof(*rbio->finish_pbitmap) * - BITS_TO_LONGS(stripe_npages), + sizeof(*rbio->bio_sectors) * num_sectors + + sizeof(*rbio->stripe_sectors) * num_sectors + + sizeof(*rbio->finish_pointers) * real_stripes, GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -986,10 +947,11 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, INIT_LIST_HEAD(&rbio->stripe_cache); INIT_LIST_HEAD(&rbio->hash_list); rbio->bioc = bioc; - rbio->stripe_len = stripe_len; rbio->nr_pages = num_pages; + rbio->nr_sectors = num_sectors; rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; + rbio->stripe_nsectors = stripe_nsectors; rbio->faila = -1; rbio->failb = -1; refcount_set(&rbio->refs, 1); @@ -997,8 +959,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, atomic_set(&rbio->stripes_pending, 0); /* - * the stripe_pages, bio_pages, etc arrays point to the extra - * memory we allocated past the end of the rbio + * The stripe_pages, bio_sectors, etc arrays point to the extra memory + * we allocated past the end of the rbio. 
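alloc_rbio() still carves all of its per-stripe arrays out of a single allocation; only the set of trailing arrays changed (bio_sectors and stripe_sectors replace bio_pages, and the bitmaps move into the struct itself). A simplified sketch of that "one allocation, consume the tail" layout, mirroring the CONSUME_ALLOC() helper that follows; rbio_like, sector_ptr and alloc_rbio_like here are hypothetical stand-ins:

#include <stdio.h>
#include <stdlib.h>

struct sector_ptr {
	void *page;
	unsigned int pgoff;
};

struct rbio_like {
	int nr_pages;
	int nr_sectors;
	/* Both arrays live in the same allocation, right after the struct. */
	void **stripe_pages;
	struct sector_ptr *stripe_sectors;
};

static struct rbio_like *alloc_rbio_like(int nr_pages, int nr_sectors)
{
	struct rbio_like *r;
	unsigned char *p;

	r = calloc(1, sizeof(*r) +
		      sizeof(*r->stripe_pages) * nr_pages +
		      sizeof(*r->stripe_sectors) * nr_sectors);
	if (!r)
		return NULL;

	/* Hand out the tail of the allocation in order. */
	p = (unsigned char *)(r + 1);
	r->stripe_pages = (void **)p;
	p += sizeof(*r->stripe_pages) * nr_pages;
	r->stripe_sectors = (struct sector_ptr *)p;

	r->nr_pages = nr_pages;
	r->nr_sectors = nr_sectors;
	return r;
}

int main(void)
{
	struct rbio_like *r = alloc_rbio_like(48, 192);

	if (!r)
		return 1;
	printf("struct at %p, pages[] at %p, sectors[] at %p\n",
	       (void *)r, (void *)r->stripe_pages, (void *)r->stripe_sectors);
	free(r);
	return 0;
}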
*/ p = rbio + 1; #define CONSUME_ALLOC(ptr, count) do { \ @@ -1006,79 +968,76 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ } while (0) CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_pages, num_pages); + CONSUME_ALLOC(rbio->bio_sectors, num_sectors); + CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); CONSUME_ALLOC(rbio->finish_pointers, real_stripes); - CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages)); - CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages)); #undef CONSUME_ALLOC - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5) - nr_data = real_stripes - 1; - else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) - nr_data = real_stripes - 2; - else - BUG(); + ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); + rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); - rbio->nr_data = nr_data; return rbio; } /* allocate pages for all the stripes in the bio, including parity */ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + int ret; - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + if (ret < 0) + return ret; + /* Mapping all sectors */ + index_stripe_sectors(rbio); return 0; } /* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { - int i; - struct page *page; + const int data_pages = rbio->nr_data * rbio->stripe_npages; + int ret; - i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); + ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, + rbio->stripe_pages + data_pages); + if (ret < 0) + return ret; - for (; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) - continue; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[i] = page; - } + index_stripe_sectors(rbio); return 0; } /* - * add a single page from a specific stripe into our list of bios for IO - * this will try to merge into existing bios if possible, and returns - * zero if all went well. + * Add a single sector @sector into our list of bios for IO. + * + * Return 0 if everything went well. + * Return <0 for error. */ -static int rbio_add_io_page(struct btrfs_raid_bio *rbio, - struct bio_list *bio_list, - struct page *page, - int stripe_nr, - unsigned long page_index, - unsigned long bio_max_len) -{ +static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list, + struct sector_ptr *sector, + unsigned int stripe_nr, + unsigned int sector_nr, + enum req_op op) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio *last = bio_list->tail; int ret; struct bio *bio; struct btrfs_io_stripe *stripe; u64 disk_start; + /* + * Note: here stripe_nr has taken device replace into consideration, + * thus it can be larger than rbio->real_stripe. + * So here we check against bioc->num_stripes, not rbio->real_stripes. 
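rbio_add_io_sector() above addresses the device in sector units: the target byte is stripe->physical + sector_nr * sectorsize, and a sector is appended to the previous bio only when it is physically contiguous on the same device; otherwise a new bio is allocated. A small sketch of that placement and merge decision; io_stripe, tail_bio and can_merge() are hypothetical simplifications:

#include <stdbool.h>
#include <stdio.h>

#define SECTORSIZE	4096ull

struct io_stripe {
	int devid;
	unsigned long long physical;	/* byte offset on that device */
};

/* Tracks the tail bio of the list, as rbio_add_io_sector() does. */
struct tail_bio {
	int devid;
	unsigned long long start;
	unsigned long long len;
};

static bool can_merge(const struct tail_bio *tail, int devid,
		      unsigned long long disk_start)
{
	return tail->len && tail->devid == devid &&
	       tail->start + tail->len == disk_start;
}

int main(void)
{
	struct io_stripe stripe = { .devid = 1, .physical = 1048576 };
	struct tail_bio tail = { 0 };
	unsigned int sector_nr;

	for (sector_nr = 0; sector_nr < 4; sector_nr++) {
		unsigned long long disk_start =
			stripe.physical + sector_nr * SECTORSIZE;

		if (can_merge(&tail, stripe.devid, disk_start)) {
			tail.len += SECTORSIZE;		/* bio_add_page() case */
		} else {
			tail.devid = stripe.devid;	/* allocate a new bio */
			tail.start = disk_start;
			tail.len = SECTORSIZE;
		}
		printf("sector %u -> dev %d, disk_start %llu (%s)\n",
		       sector_nr, stripe.devid, disk_start,
		       tail.len > SECTORSIZE ? "merged" : "new bio");
	}
	return 0;
}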
+ */ + ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes); + ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors); + ASSERT(sector->page); + stripe = &rbio->bioc->stripes[stripe_nr]; - disk_start = stripe->physical + (page_index << PAGE_SHIFT); + disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ if (!stripe->dev->bdev) @@ -1095,20 +1054,21 @@ static int rbio_add_io_page(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, page, PAGE_SIZE, 0); - if (ret == PAGE_SIZE) + ret = bio_add_page(last, sector->page, sectorsize, + sector->pgoff); + if (ret == sectorsize) return 0; } } /* put a new bio on the list */ - bio = btrfs_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1); - btrfs_bio(bio)->device = stripe->dev; - bio->bi_iter.bi_size = 0; - bio_set_dev(bio, stripe->dev->bdev); + bio = bio_alloc(stripe->dev->bdev, + max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1), + op, GFP_NOFS); bio->bi_iter.bi_sector = disk_start >> 9; + bio->bi_private = rbio; - bio_add_page(bio, page, PAGE_SIZE, 0); + bio_add_page(bio, sector->page, sectorsize, sector->pgoff); bio_list_add(bio_list, bio); return 0; } @@ -1130,6 +1090,29 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } } +static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) +{ + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct bio_vec bvec; + struct bvec_iter iter; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + + bio_for_each_segment(bvec, bio, iter) { + u32 bvec_offset; + + for (bvec_offset = 0; bvec_offset < bvec.bv_len; + bvec_offset += sectorsize, offset += sectorsize) { + int index = offset / sectorsize; + struct sector_ptr *sector = &rbio->bio_sectors[index]; + + sector->page = bvec.bv_page; + sector->pgoff = bvec.bv_offset + bvec_offset; + ASSERT(sector->pgoff < PAGE_SIZE); + } + } +} + /* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly @@ -1141,29 +1124,40 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) static void index_rbio_pages(struct btrfs_raid_bio *rbio) { struct bio *bio; - u64 start; - unsigned long stripe_offset; - unsigned long page_index; spin_lock_irq(&rbio->bio_list_lock); - bio_list_for_each(bio, &rbio->bio_list) { - struct bio_vec bvec; - struct bvec_iter iter; - int i = 0; + bio_list_for_each(bio, &rbio->bio_list) + index_one_bio(rbio, bio); + + spin_unlock_irq(&rbio->bio_list_lock); +} + +static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio, + struct raid56_bio_trace_info *trace_info) +{ + const struct btrfs_io_context *bioc = rbio->bioc; + int i; - start = bio->bi_iter.bi_sector << 9; - stripe_offset = start - rbio->bioc->raid_map[0]; - page_index = stripe_offset >> PAGE_SHIFT; + ASSERT(bioc); - if (bio_flagged(bio, BIO_CLONED)) - bio->bi_iter = btrfs_bio(bio)->iter; + /* We rely on bio->bi_bdev to find the stripe number. 
*/ + if (!bio->bi_bdev) + goto not_found; - bio_for_each_segment(bvec, bio, iter) { - rbio->bio_pages[page_index + i] = bvec.bv_page; - i++; - } + for (i = 0; i < bioc->num_stripes; i++) { + if (bio->bi_bdev != bioc->stripes[i].dev->bdev) + continue; + trace_info->stripe_nr = i; + trace_info->devid = bioc->stripes[i].dev->devid; + trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + bioc->stripes[i].physical; + return; } - spin_unlock_irq(&rbio->bio_list_lock); + +not_found: + trace_info->devid = -1; + trace_info->offset = -1; + trace_info->stripe_nr = -1; } /* @@ -1177,10 +1171,14 @@ static void index_rbio_pages(struct btrfs_raid_bio *rbio) static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; int nr_data = rbio->nr_data; + /* The total sector number inside the full stripe. */ + int total_sector_nr; int stripe; - int pagenr; + /* Sector number inside a stripe. */ + int sectornr; bool has_qstripe; struct bio_list bio_list; struct bio *bio; @@ -1195,6 +1193,9 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else BUG(); + /* We should have at least one data sector. */ + ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); + /* at this point we either have a full stripe, * or we've read the full stripe from the drive. * recalculate the parity and write the new results. @@ -1224,86 +1225,108 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - /* first collect one page from each data stripe */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + + /* First collect one sector from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } - /* then add the parity stripe */ - p = rbio_pstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; if (has_qstripe) { - /* - * raid6, add the qstripe and call the - * library function to fill in our p/q + * RAID6, add the qstripe and call the library function + * to fill in our p/q */ - p = rbio_qstripe_page(rbio, pagenr); - SetPageUptodate(p); - pointers[stripe++] = kmap_local_page(p); + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } for (stripe = stripe - 1; stripe >= 0; stripe--) kunmap_local(pointers[stripe]); } /* - * time to start writing. Make bios for everything from the - * higher layers (the bio_list in our rbio) and our p/q. Ignore - * everything else. + * Start writing. 
Make bios for everything from the higher layers (the + * bio_list in our rbio) and our P/Q. Ignore everything else. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; - ret = rbio_add_io_page(rbio, &bio_list, - page, stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; + /* This vertical stripe has no data, skip it. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } if (likely(!bioc->num_tgtdevs)) goto write_data; - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - if (!bioc->tgtdev_map[stripe]) + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + + stripe = total_sector_nr / rbio->stripe_nsectors; + sectornr = total_sector_nr % rbio->stripe_nsectors; + + if (!bioc->tgtdev_map[stripe]) { + /* + * We can skip the whole stripe completely, note + * total_sector_nr will be increased by one anyway. + */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; + } - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - if (stripe < rbio->nr_data) { - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (!page) - continue; - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } + /* This vertical stripe has no data, skip it. 
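The rewritten finish_rmw() loop above works one sectorsize unit at a time: for RAID5 the parity sector is the XOR of all data sectors in the same vertical stripe (the memcpy plus run_xor pair), while RAID6 hands the same pointer array to the raid6 library's gen_syndrome(). A self-contained sketch of the RAID5 half, assuming three data stripes and a 4K sector; xor_into() is a stand-in for run_xor():

#include <stdio.h>
#include <string.h>

#define SECTORSIZE	4096
#define NR_DATA		3

/* XOR @src into @dest, the core operation of run_xor(). */
static void xor_into(unsigned char *dest, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	static unsigned char data[NR_DATA][SECTORSIZE];
	static unsigned char parity[SECTORSIZE];
	int i;

	/* Fill the data sectors with something recognizable. */
	for (i = 0; i < NR_DATA; i++)
		memset(data[i], 'A' + i, SECTORSIZE);

	/* P = D0, then P ^= D1 ... D(n-1), as finish_rmw() does per sector. */
	memcpy(parity, data[0], SECTORSIZE);
	for (i = 1; i < NR_DATA; i++)
		xor_into(parity, data[i], SECTORSIZE);

	printf("parity[0] = 0x%02x (expected 0x%02x)\n",
	       parity[0], 'A' ^ 'B' ^ 'C');
	return 0;
}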
*/ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - rbio->bioc->tgtdev_map[stripe], - pagenr, rbio->stripe_len); - if (ret) - goto cleanup; + if (stripe < rbio->nr_data) { + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (!sector) + continue; + } else { + sector = rbio_stripe_sector(rbio, stripe, sectornr); } + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + rbio->bioc->tgtdev_map[stripe], + sectornr, REQ_OP_WRITE); + if (ret) + goto cleanup; } write_data: @@ -1311,10 +1334,14 @@ write_data: BUG_ON(atomic_read(&rbio->stripes_pending) == 0); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -1342,7 +1369,7 @@ static int find_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->bioc->num_stripes; i++) { stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, rbio->stripe_len) && + if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { return i; } @@ -1364,7 +1391,7 @@ static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, for (i = 0; i < rbio->nr_data; i++) { u64 stripe_start = rbio->bioc->raid_map[i]; - if (in_range(logical, stripe_start, rbio->stripe_len)) + if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) return i; } return -1; @@ -1417,56 +1444,89 @@ static int fail_bio_stripe(struct btrfs_raid_bio *rbio, } /* + * For subpage case, we can no longer set page Uptodate directly for + * stripe_pages[], thus we need to locate the sector. + */ +static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, + struct page *page, + unsigned int pgoff) +{ + int i; + + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + + if (sector->page == page && sector->pgoff == pgoff) + return sector; + } + return NULL; +} + +/* * this sets each page in the bio uptodate. It should only be used on private * rbio pages, nothing that comes in from the higher layers */ -static void set_bio_pages_uptodate(struct bio *bio) +static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; struct bio_vec *bvec; struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) - SetPageUptodate(bvec->bv_page); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct sector_ptr *sector; + int pgoff; + + for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; + pgoff += sectorsize) { + sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + ASSERT(sector); + if (sector) + sector->uptodate = 1; + } + } } -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. 
- * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid_rmw_end_io(struct bio *bio) +static void raid56_bio_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; if (bio->bi_status) fail_bio_stripe(rbio, bio); else - set_bio_pages_uptodate(bio); + set_bio_pages_uptodate(rbio, bio); bio_put(bio); - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + if (atomic_dec_and_test(&rbio->stripes_pending)) + queue_work(rbio->bioc->fs_info->endio_raid56_workers, + &rbio->end_io_work); +} - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; +/* + * End io handler for the read phase of the RMW cycle. All the bios here are + * physical stripe bios we've read from the disk so we can recalculate the + * parity of the stripe. + * + * This will usually kick off finish_rmw once all the bios are read in, but it + * may trigger parity reconstruction if we had any errors along the way + */ +static void raid56_rmw_end_io_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); + + if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { + rbio_orig_end_io(rbio, BLK_STS_IOERR); + return; + } /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write but if there + * are any failed stripes we'll reconstruct from parity first. */ validate_rbio_for_rmw(rbio); - return; - -cleanup: - - rbio_orig_end_io(rbio, BLK_STS_IOERR); } /* @@ -1477,9 +1537,9 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) { int bios_to_read = 0; struct bio_list bio_list; + const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -1491,36 +1551,34 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; + /* Build a list of bios to read all the missing data sectors. */ + for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it - */ - if (PageUptodate(page)) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a page + * in the bio list we don't need to read it off the stripe. 
+ */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate page. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -1539,13 +1597,16 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid_rmw_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_read_partial_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_read_partial(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -1624,7 +1685,7 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct btrfs_work work; + struct work_struct work; }; /* @@ -1692,7 +1753,7 @@ static void run_plug(struct btrfs_plug_cb *plug) * if the unplug comes from schedule, we have to push the * work off to a helper thread */ -static void unplug_work(struct btrfs_work *work) +static void unplug_work(struct work_struct *work) { struct btrfs_plug_cb *plug; plug = container_of(work, struct btrfs_plug_cb, work); @@ -1705,36 +1766,60 @@ static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) plug = container_of(cb, struct btrfs_plug_cb, cb); if (from_schedule) { - btrfs_init_work(&plug->work, unplug_work, NULL, NULL); - btrfs_queue_work(plug->info->rmw_workers, - &plug->work); + INIT_WORK(&plug->work, unplug_work); + queue_work(plug->info->rmw_workers, &plug->work); return; } run_plug(plug); } +/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ +static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) +{ + const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT; + const u64 full_stripe_start = rbio->bioc->raid_map[0]; + const u32 orig_len = orig_bio->bi_iter.bi_size; + const u32 sectorsize = fs_info->sectorsize; + u64 cur_logical; + + ASSERT(orig_logical >= full_stripe_start && + orig_logical + orig_len <= full_stripe_start + + rbio->nr_data * BTRFS_STRIPE_LEN); + + bio_list_add(&rbio->bio_list, orig_bio); + rbio->bio_list_bytes += orig_bio->bi_iter.bi_size; + + /* Update the dbitmap. */ + for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len; + cur_logical += sectorsize) { + int bit = ((u32)(cur_logical - full_stripe_start) >> + fs_info->sectorsize_bits) % rbio->stripe_nsectors; + + set_bit(bit, &rbio->dbitmap); + } +} + /* * our main entry point for writes from the rest of the FS. 
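rbio_add_bio() above derives, for every sector the incoming bio touches, which vertical stripe of the full stripe it lands in and sets that bit in dbitmap, so the write path can later skip vertical stripes that carry no dirty data. A stand-alone sketch of the bit computation; the 64K stripe length matches BTRFS_STRIPE_LEN, the 4K sector size and the sample write are illustrative values:

#include <stdio.h>

#define SECTORSIZE_BITS		12		/* 4K sectors */
#define SECTORSIZE		(1u << SECTORSIZE_BITS)
#define STRIPE_LEN		65536u		/* BTRFS_STRIPE_LEN */
#define STRIPE_NSECTORS		(STRIPE_LEN / SECTORSIZE)

int main(void)
{
	/* Hypothetical full stripe start and a 24K write at +8K into it. */
	const unsigned long long full_stripe_start = 1ull << 30;
	const unsigned long long orig_logical = full_stripe_start + 8 * 1024;
	const unsigned int orig_len = 24 * 1024;
	unsigned long dbitmap = 0;
	unsigned long long cur;

	for (cur = orig_logical; cur < orig_logical + orig_len; cur += SECTORSIZE) {
		int bit = (unsigned int)((cur - full_stripe_start) >> SECTORSIZE_BITS) %
			  STRIPE_NSECTORS;

		dbitmap |= 1ul << bit;
	}
	printf("dbitmap = 0x%04lx\n", dbitmap);	/* bits 2..7 set -> 0x00fc */
	return 0;
}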
*/ -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len) +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; struct btrfs_plug_cb *plug = NULL; struct blk_plug_cb *cb; - int ret; + int ret = 0; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + ret = PTR_ERR(rbio); + goto out_dec_counter; } - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; rbio->operation = BTRFS_RBIO_WRITE; + rbio_add_bio(rbio, bio); - btrfs_bio_counter_inc_noblocked(fs_info); rbio->generic_bio_cnt = 1; /* @@ -1744,8 +1829,8 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, if (rbio_is_full(rbio)) { ret = full_stripe_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); - return ret; + goto out_dec_counter; + return; } cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); @@ -1756,13 +1841,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - ret = 0; } else { ret = __raid56_parity_write(rbio); if (ret) - btrfs_bio_counter_dec(fs_info); + goto out_dec_counter; } - return ret; + + return; + +out_dec_counter: + btrfs_bio_counter_dec(fs_info); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); } /* @@ -1772,14 +1862,18 @@ int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, */ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { - int pagenr, stripe; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int sectornr, stripe; void **pointers; void **unmap_array; int faila = -1, failb = -1; - struct page *page; blk_status_t err; int i; + /* + * This array stores the pointer for each sector, thus it has the extra + * pgoff value added from each sector + */ pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); if (!pointers) { err = BLK_STS_RESOURCE; @@ -1808,43 +1902,44 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + struct sector_ptr *sector; + /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. 
*/ if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(pagenr, rbio->dbitmap)) + !test_bit(sectornr, &rbio->dbitmap)) continue; /* - * Setup our array of pointers with pages from each stripe + * Setup our array of pointers with sectors from each stripe * * NOTE: store a duplicate array of pointers to preserve the * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* - * if we're rebuilding a read, we have to use + * If we're rebuilding a read, we have to use * pages from the bio list */ if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); } else { - page = rbio_stripe_page(rbio, stripe, pagenr); + sector = rbio_stripe_sector(rbio, stripe, sectornr); } - pointers[stripe] = kmap_local_page(page); + ASSERT(sector->page); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; unmap_array[stripe] = pointers[stripe]; } - /* all raid6 handling here */ + /* All raid6 handling here */ if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* - * single failure, rebuild from parity raid5 - * style - */ + /* Single failure, rebuild from parity raid5 style */ if (failb < 0) { if (faila == rbio->nr_data) { /* @@ -1887,10 +1982,10 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { raid6_datap_recov(rbio->real_stripes, - PAGE_SIZE, faila, pointers); + sectorsize, faila, pointers); } else { raid6_2data_recov(rbio->real_stripes, - PAGE_SIZE, faila, failb, + sectorsize, faila, failb, pointers); } } else { @@ -1900,7 +1995,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) BUG_ON(failb != -1); pstripe: /* Copy parity block into failed block to start with */ - copy_page(pointers[faila], pointers[rbio->nr_data]); + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); /* rearrange the pointer array */ p = pointers[faila]; @@ -1909,7 +2004,7 @@ pstripe: pointers[rbio->nr_data - 1] = p; /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE); + run_xor(pointers, rbio->nr_data - 1, sectorsize); } /* if we're doing this rebuild as part of an rmw, go through * and set all of our private rbio pages in the @@ -1918,14 +2013,14 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_npages; i++) { + for (i = 0; i < rbio->stripe_nsectors; i++) { if (faila != -1) { - page = rbio_stripe_page(rbio, faila, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, faila, i); + sector->uptodate = 1; } if (failb != -1) { - page = rbio_stripe_page(rbio, failb, i); - SetPageUptodate(page); + sector = rbio_stripe_sector(rbio, failb, i); + sector->uptodate = 1; } } } @@ -1984,25 +2079,13 @@ cleanup_io: } /* - * This is called only for stripes we've read from disk to - * reconstruct the parity. + * This is called only for stripes we've read from disk to reconstruct the + * parity. 
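The recovery path above ends in the RAID5 "pstripe" case: the parity sector is copied into the failed slot, the pointer array is rotated so that slot sits last, and run_xor() folds the surviving data sectors back in, leaving the reconstructed data in place. The same algebra in a self-contained sketch (three data buffers plus parity, one buffer treated as lost); xor_into() again stands in for run_xor():

#include <assert.h>
#include <stdio.h>
#include <string.h>

#define SECTORSIZE	4096
#define NR_DATA		3

static void xor_into(unsigned char *dest, const unsigned char *src, size_t len)
{
	size_t i;

	for (i = 0; i < len; i++)
		dest[i] ^= src[i];
}

int main(void)
{
	static unsigned char data[NR_DATA][SECTORSIZE];
	static unsigned char parity[SECTORSIZE];
	static unsigned char rebuilt[SECTORSIZE];
	const int faila = 1;			/* pretend D1 was lost */
	int i;

	for (i = 0; i < NR_DATA; i++)
		memset(data[i], 0x10 * (i + 1), SECTORSIZE);

	/* P = D0 ^ D1 ^ D2 */
	memcpy(parity, data[0], SECTORSIZE);
	for (i = 1; i < NR_DATA; i++)
		xor_into(parity, data[i], SECTORSIZE);

	/* Start from the parity block, then XOR in every surviving block. */
	memcpy(rebuilt, parity, SECTORSIZE);
	for (i = 0; i < NR_DATA; i++)
		if (i != faila)
			xor_into(rebuilt, data[i], SECTORSIZE);

	assert(memcmp(rebuilt, data[faila], SECTORSIZE) == 0);
	printf("D%d rebuilt from parity\n", faila);
	return 0;
}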
*/ -static void raid_recover_end_io(struct bio *bio) +static void raid_recover_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - /* - * we only read stripe pages off the disk, set them - * up to date if there were no errors - */ - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); if (atomic_read(&rbio->error) > rbio->bioc->max_errors) rbio_orig_end_io(rbio, BLK_STS_IOERR); @@ -2023,8 +2106,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2036,33 +2118,31 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) atomic_set(&rbio->error, 0); /* - * read everything that hasn't failed. Thanks to the - * stripe cache, it is possible that some or all of these - * pages are going to be uptodate. + * Read everything that hasn't failed. However this time we will + * not trust any cached sector. + * As we may read out some stale data but higher layer is not reading + * that stale part. + * + * So here we always re-read everything in recovery path. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + struct sector_ptr *sector; + if (rbio->faila == stripe || rbio->failb == stripe) { atomic_inc(&rbio->error); + /* Skip the current stripe. */ + ASSERT(sectornr == 0); + total_sector_nr += rbio->stripe_nsectors - 1; continue; } - - for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { - struct page *p; - - /* - * the rmw code may have already read this - * page in - */ - p = rbio_stripe_page(rbio, stripe, pagenr); - if (PageUptodate(p)) - continue; - - ret = rbio_add_io_page(rbio, &bio_list, - rbio_stripe_page(rbio, stripe, pagenr), - stripe, pagenr, rbio->stripe_len); - if (ret < 0) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret < 0) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2085,13 +2165,16 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid_recover_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_recover_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); + } submit_bio(bio); } @@ -2114,28 +2197,27 @@ cleanup: * so we assume the bio they send down corresponds to a failed part * of the drive. 
*/ -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io) +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - int ret; if (generic_io) { ASSERT(bioc->mirror_num == mirror_num); btrfs_bio(bio)->mirror_num = mirror_num; + } else { + btrfs_get_bioc(bioc); } - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { - if (generic_io) - btrfs_put_bioc(bioc); - return PTR_ERR(rbio); + bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); + goto out_end_bio; } rbio->operation = BTRFS_RBIO_READ_REBUILD; - bio_list_add(&rbio->bio_list, bio); - rbio->bio_list_bytes = bio->bi_iter.bi_size; + rbio_add_bio(rbio, bio); rbio->faila = find_logical_bio_stripe(rbio, bio); if (rbio->faila == -1) { @@ -2143,18 +2225,13 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", __func__, bio->bi_iter.bi_sector << 9, (u64)bio->bi_iter.bi_size, bioc->map_type); - if (generic_io) - btrfs_put_bioc(bioc); kfree(rbio); - return -EIO; + bio->bi_status = BLK_STS_IOERR; + goto out_end_bio; } - if (generic_io) { - btrfs_bio_counter_inc_noblocked(fs_info); + if (generic_io) rbio->generic_bio_cnt = 1; - } else { - btrfs_get_bioc(bioc); - } /* * Loop retry: @@ -2173,27 +2250,23 @@ int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio->failb--; } - ret = lock_stripe_add(rbio); + if (lock_stripe_add(rbio)) + return; /* - * __raid56_parity_recover will end the bio with - * any errors it hits. We don't want to return - * its error value up the stack because our caller - * will end up calling bio_endio with any nonzero - * return - */ - if (ret == 0) - __raid56_parity_recover(rbio); - /* - * our rbio has been added to the list of - * rbios that will be handled after the - * currently lock owner is done + * This adds our rbio to the list of rbios that will be handled after + * the current lock owner is done. 
*/ - return 0; + __raid56_parity_recover(rbio); + return; +out_end_bio: + btrfs_bio_counter_dec(fs_info); + btrfs_put_bioc(bioc); + bio_endio(bio); } -static void rmw_work(struct btrfs_work *work) +static void rmw_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2201,7 +2274,7 @@ static void rmw_work(struct btrfs_work *work) raid56_rmw_stripe(rbio); } -static void read_rebuild_work(struct btrfs_work *work) +static void read_rebuild_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2221,14 +2294,14 @@ static void read_rebuild_work(struct btrfs_work *work) struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, struct btrfs_device *scrub_dev, + struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; int i; - rbio = alloc_rbio(fs_info, bioc, stripe_len); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; bio_list_add(&rbio->bio_list, bio); @@ -2252,10 +2325,7 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, } ASSERT(i < rbio->real_stripes); - /* Now we just support the sectorsize equals to page size */ - ASSERT(fs_info->sectorsize == PAGE_SIZE); - ASSERT(rbio->stripe_npages == stripe_nsectors); - bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors); + bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors); /* * We have already increased bio_counter when getting bioc, record it @@ -2268,17 +2338,19 @@ struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, /* Used for both parity scrub and missing. */ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical) + unsigned int pgoff, u64 logical) { + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; int stripe_offset; int index; ASSERT(logical >= rbio->bioc->raid_map[0]); - ASSERT(logical + PAGE_SIZE <= rbio->bioc->raid_map[0] + - rbio->stripe_len * rbio->nr_data); + ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] + + BTRFS_STRIPE_LEN * rbio->nr_data); stripe_offset = (int)(logical - rbio->bioc->raid_map[0]); - index = stripe_offset >> PAGE_SHIFT; - rbio->bio_pages[index] = page; + index = stripe_offset / sectorsize; + rbio->bio_sectors[index].page = page; + rbio->bio_sectors[index].pgoff = pgoff; } /* @@ -2287,23 +2359,25 @@ void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, */ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) { - int i; - int bit; - int index; - struct page *page; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + int total_sector_nr; - for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) { - for (i = 0; i < rbio->real_stripes; i++) { - index = i * rbio->stripe_npages + bit; - if (rbio->stripe_pages[index]) - continue; + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct page *page; + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT; - page = alloc_page(GFP_NOFS); - if (!page) - return -ENOMEM; - rbio->stripe_pages[index] = page; - } + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; + if (rbio->stripe_pages[index]) + continue; + page = alloc_page(GFP_NOFS); + if (!page) + return -ENOMEM; + rbio->stripe_pages[index] = page; } + index_stripe_sectors(rbio); return 0; } @@ -2311,14 +2385,15 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int 
need_check) { struct btrfs_io_context *bioc = rbio->bioc; + const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - unsigned long *pbitmap = rbio->finish_pbitmap; + unsigned long *pbitmap = &rbio->finish_pbitmap; int nr_data = rbio->nr_data; int stripe; - int pagenr; + int sectornr; bool has_qstripe; - struct page *p_page = NULL; - struct page *q_page = NULL; + struct sector_ptr p_sector = { 0 }; + struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; struct bio *bio; int is_replace = 0; @@ -2335,7 +2410,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) { is_replace = 1; - bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages); + bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors); } /* @@ -2348,54 +2423,59 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!need_check) goto writeback; - p_page = alloc_page(GFP_NOFS); - if (!p_page) + p_sector.page = alloc_page(GFP_NOFS); + if (!p_sector.page) goto cleanup; - SetPageUptodate(p_page); + p_sector.pgoff = 0; + p_sector.uptodate = 1; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_page = alloc_page(GFP_NOFS); - if (!q_page) { - __free_page(p_page); + q_sector.page = alloc_page(GFP_NOFS); + if (!q_sector.page) { + __free_page(p_sector.page); + p_sector.page = NULL; goto cleanup; } - SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); + q_sector.pgoff = 0; + q_sector.uptodate = 1; + pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_page); + pointers[nr_data] = kmap_local_page(p_sector.page); - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *p; + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; void *parity; + /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { - p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap_local_page(p); + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; } if (has_qstripe) { /* RAID6, call the library function to fill in our P/Q */ - raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, pointers); } else { /* raid5 */ - copy_page(pointers[nr_data], pointers[0]); - run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); + memcpy(pointers[nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, nr_data - 1, sectorsize); } /* Check scrubbing parity and repair it */ - p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap_local_page(p); - if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) - copy_page(parity, pointers[rbio->scrubp]); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + parity = kmap_local_page(sector->page) + sector->pgoff; + if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) + memcpy(parity, pointers[rbio->scrubp], sectorsize); else /* Parity is right, needn't writeback */ - bitmap_clear(rbio->dbitmap, pagenr, 1); + bitmap_clear(&rbio->dbitmap, sectornr, 1); kunmap_local(parity); for (stripe = nr_data - 1; stripe >= 0; stripe--) @@ -2403,10 +2483,12 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, } kunmap_local(pointers[nr_data]); - 
__free_page(p_page); - if (q_page) { + __free_page(p_sector.page); + p_sector.page = NULL; + if (q_sector.page) { kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_page); + __free_page(q_sector.page); + q_sector.page = NULL; } writeback: @@ -2415,12 +2497,12 @@ writeback: * higher layers (the bio_list in our rbio) and our p/q. Ignore * everything else. */ - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, - page, rbio->scrubp, pagenr, rbio->stripe_len); + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp, + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2428,13 +2510,13 @@ writeback: if (!is_replace) goto submit_write; - for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) { - struct page *page; + for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) { + struct sector_ptr *sector; - page = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - ret = rbio_add_io_page(rbio, &bio_list, page, + sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); + ret = rbio_add_io_sector(rbio, &bio_list, sector, bioc->tgtdev_map[rbio->scrubp], - pagenr, rbio->stripe_len); + sectornr, REQ_OP_WRITE); if (ret) goto cleanup; } @@ -2450,10 +2532,14 @@ submit_write: atomic_set(&rbio->stripes_pending, nr_data); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; bio->bi_end_io = raid_write_end_io; - bio->bi_opf = REQ_OP_WRITE; + if (trace_raid56_scrub_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); + } submit_bio(bio); } return; @@ -2541,24 +2627,14 @@ cleanup: * This will usually kick off finish_rmw once all the bios are read in, but it * may trigger parity reconstruction if we had any errors along the way */ -static void raid56_parity_scrub_end_io(struct bio *bio) +static void raid56_parity_scrub_end_io_work(struct work_struct *work) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; + struct btrfs_raid_bio *rbio = + container_of(work, struct btrfs_raid_bio, end_io_work); /* - * this will normally call finish_rmw to start our write - * but if there are any failed stripes we'll reconstruct - * from parity first + * This will normally call finish_rmw to start our write, but if there + * are any failed stripes we'll reconstruct from parity first */ validate_rbio_for_parity_scrub(rbio); } @@ -2568,8 +2644,7 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int pagenr; - int stripe; + int total_sector_nr; struct bio *bio; bio_list_init(&bio_list); @@ -2579,36 +2654,38 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) goto cleanup; atomic_set(&rbio->error, 0); - /* - * build a list of bios to read all the missing parts of this - * stripe - */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) { - struct page *page; - /* - * we want to find all the pages missing from - * the rbio and read them from the 
disk. If - * page_in_rbio finds a page in the bio list - * we don't need to read it off the stripe. - */ - page = page_in_rbio(rbio, stripe, pagenr, 1); - if (page) - continue; + /* Build a list of bios to read all the missing parts. */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + int sectornr = total_sector_nr % rbio->stripe_nsectors; + int stripe = total_sector_nr / rbio->stripe_nsectors; + struct sector_ptr *sector; + + /* No data in the vertical stripe, no need to read. */ + if (!test_bit(sectornr, &rbio->dbitmap)) + continue; - page = rbio_stripe_page(rbio, stripe, pagenr); - /* - * the bio cache may have handed us an uptodate - * page. If so, be happy and use it - */ - if (PageUptodate(page)) - continue; + /* + * We want to find all the sectors missing from the rbio and + * read them from the disk. If sector_in_rbio() finds a sector + * in the bio list we don't need to read it off the stripe. + */ + sector = sector_in_rbio(rbio, stripe, sectornr, 1); + if (sector) + continue; - ret = rbio_add_io_page(rbio, &bio_list, page, - stripe, pagenr, rbio->stripe_len); - if (ret) - goto cleanup; - } + sector = rbio_stripe_sector(rbio, stripe, sectornr); + /* + * The bio cache may have handed us an uptodate sector. If so, + * use it. + */ + if (sector->uptodate) + continue; + + ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + sectornr, REQ_OP_READ); + if (ret) + goto cleanup; } bios_to_read = bio_list_size(&bio_list); @@ -2627,13 +2704,16 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) * touch it after that. */ atomic_set(&rbio->stripes_pending, bios_to_read); + INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); while ((bio = bio_list_pop(&bio_list))) { - bio->bi_private = rbio; - bio->bi_end_io = raid56_parity_scrub_end_io; - bio->bi_opf = REQ_OP_READ; + bio->bi_end_io = raid56_bio_end_io; - btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56); + if (trace_raid56_scrub_read_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_scrub_read(rbio, bio, &trace_info); + } submit_bio(bio); } /* the actual write will happen once the reads are done */ @@ -2651,7 +2731,7 @@ finish: validate_rbio_for_parity_scrub(rbio); } -static void scrub_parity_work(struct btrfs_work *work) +static void scrub_parity_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; @@ -2668,13 +2748,12 @@ void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) /* The following code is used for dev replace of a missing RAID 5/6 device. 
*/ struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length) +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) { struct btrfs_fs_info *fs_info = bioc->fs_info; struct btrfs_raid_bio *rbio; - rbio = alloc_rbio(fs_info, bioc, length); + rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) return NULL; diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 72c00fc284b5..6f48f9e4c869 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -7,46 +7,179 @@ #ifndef BTRFS_RAID56_H #define BTRFS_RAID56_H -static inline int nr_parity_stripes(const struct map_lookup *map) -{ - if (map->type & BTRFS_BLOCK_GROUP_RAID5) - return 1; - else if (map->type & BTRFS_BLOCK_GROUP_RAID6) - return 2; - else - return 0; -} +#include <linux/workqueue.h> +#include "volumes.h" + +enum btrfs_rbio_ops { + BTRFS_RBIO_WRITE, + BTRFS_RBIO_READ_REBUILD, + BTRFS_RBIO_PARITY_SCRUB, + BTRFS_RBIO_REBUILD_MISSING, +}; + +struct btrfs_raid_bio { + struct btrfs_io_context *bioc; + + /* + * While we're doing RMW on a stripe we put it into a hash table so we + * can lock the stripe and merge more rbios into it. + */ + struct list_head hash_list; + + /* LRU list for the stripe cache */ + struct list_head stripe_cache; + + /* For scheduling work in the helper threads */ + struct work_struct work; + + /* + * bio_list and bio_list_lock are used to add more bios into the stripe + * in hopes of avoiding the full RMW + */ + struct bio_list bio_list; + spinlock_t bio_list_lock; + + /* + * Also protected by the bio_list_lock, the plug list is used by the + * plugging code to collect partial bios while plugged. The stripe + * locking code also uses it to hand off the stripe lock to the next + * pending IO. + */ + struct list_head plug_list; + + /* Flags that tell us if it is safe to merge with this bio. */ + unsigned long flags; + + /* + * Set if we're doing a parity rebuild for a read from higher up, which + * is handled differently from a parity rebuild as part of RMW. + */ + enum btrfs_rbio_ops operation; + + /* How many pages there are for the full stripe including P/Q */ + u16 nr_pages; + + /* How many sectors there are for the full stripe including P/Q */ + u16 nr_sectors; + + /* Number of data stripes (no p/q) */ + u8 nr_data; + + /* Numer of all stripes (including P/Q) */ + u8 real_stripes; + + /* How many pages there are for each stripe */ + u8 stripe_npages; + + /* How many sectors there are for each stripe */ + u8 stripe_nsectors; + + /* First bad stripe, -1 means no corruption */ + s8 faila; + + /* Second bad stripe (for RAID6 use) */ + s8 failb; + + /* Stripe number that we're scrubbing */ + u8 scrubp; + + /* + * Size of all the bios in the bio_list. This helps us decide if the + * rbio maps to a full stripe or not. + */ + int bio_list_bytes; + + int generic_bio_cnt; + + refcount_t refs; + + atomic_t stripes_pending; + + atomic_t error; + + struct work_struct end_io_work; + + /* Bitmap to record which horizontal stripe has data */ + unsigned long dbitmap; + + /* Allocated with stripe_nsectors-many bits for finish_*() calls */ + unsigned long finish_pbitmap; + + /* + * These are two arrays of pointers. We allocate the rbio big enough + * to hold them both and setup their locations when the rbio is + * allocated. + */ + + /* + * Pointers to pages that we allocated for reading/writing stripes + * directly from the disk (including P/Q). 
+ */ + struct page **stripe_pages; + + /* Pointers to the sectors in the bio_list, for faster lookup */ + struct sector_ptr *bio_sectors; + + /* + * For subpage support, we need to map each sector to above + * stripe_pages. + */ + struct sector_ptr *stripe_sectors; + + /* Allocated with real_stripes-many pointers for finish_*() calls */ + void **finish_pointers; +}; + +/* + * For trace event usage only. Records useful debug info for each bio submitted + * by RAID56 to each physical device. + * + * No matter signed or not, (-1) is always the one indicating we can not grab + * the proper stripe number. + */ +struct raid56_bio_trace_info { + u64 devid; + + /* The offset inside the stripe. (<= STRIPE_LEN) */ + u32 offset; + + /* + * Stripe number. + * 0 is the first data stripe, and nr_data for P stripe, + * nr_data + 1 for Q stripe. + * >= real_stripes for + */ + u8 stripe_nr; +}; static inline int nr_data_stripes(const struct map_lookup *map) { - return map->num_stripes - nr_parity_stripes(map); + return map->num_stripes - btrfs_nr_parity_stripes(map->type); } + #define RAID5_P_STRIPE ((u64)-2) #define RAID6_Q_STRIPE ((u64)-1) #define is_parity_stripe(x) (((x) == RAID5_P_STRIPE) || \ ((x) == RAID6_Q_STRIPE)) -struct btrfs_raid_bio; struct btrfs_device; -int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len, int mirror_num, int generic_io); -int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, - u64 stripe_len); +void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, + int mirror_num, bool generic_io); +void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc); void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page, - u64 logical); + unsigned int pgoff, u64 logical); struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio, - struct btrfs_io_context *bioc, u64 stripe_len, + struct btrfs_io_context *bioc, struct btrfs_device *scrub_dev, unsigned long *dbitmap, int stripe_nsectors); void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio); struct btrfs_raid_bio * -raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc, - u64 length); +raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc); void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio); int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 998e3f180d90..9acf47b11fe6 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -5,6 +5,7 @@ #include "compression.h" #include "ctree.h" #include "delalloc-space.h" +#include "disk-io.h" #include "reflink.h" #include "transaction.h" #include "subpage.h" @@ -22,8 +23,10 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, int ret; inode_inc_iversion(inode); - if (!no_time_update) - inode->i_mtime = inode->i_ctime = current_time(inode); + if (!no_time_update) { + inode->i_mtime = current_time(inode); + inode->i_ctime = inode->i_mtime; + } /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. 
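/*
 * Illustrative stand-alone sketch (not from the kernel sources): the
 * sector-to-page mapping implied by the stripe_pages/stripe_sectors split
 * in raid56.h above, and by the "(total_sector_nr * sectorsize) >> PAGE_SHIFT"
 * index used in alloc_rbio_essential_pages() earlier in this patch.  The
 * sector and page sizes below are example values for a subpage setup.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;	/* fs sector size (example) */
	const unsigned int page_size = 65536;	/* 64K pages (example) */

	for (unsigned int sectornr = 0; sectornr < 32; sectornr += 7) {
		unsigned int byte_off = sectornr * sectorsize;
		unsigned int page_index = byte_off / page_size;
		unsigned int pgoff = byte_off % page_size;

		printf("sector %2u -> stripe_pages[%u] + 0x%x\n",
		       sectornr, page_index, pgoff);
	}
	return 0;
}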
@@ -110,7 +113,6 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (comp_type == BTRFS_COMPRESS_NONE) { memcpy_to_page(page, offset_in_page(file_offset), data_start, datal); - flush_dcache_page(page); } else { ret = btrfs_decompress(comp_type, data_start, page, offset_in_page(file_offset), @@ -132,10 +134,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * * So what's in the range [500, 4095] corresponds to zeroes. */ - if (datal < block_size) { + if (datal < block_size) memzero_page(page, datal, block_size - datal); - flush_dcache_page(page); - } btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); btrfs_page_clear_checked(fs_info, page, file_offset, block_size); @@ -344,6 +344,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, int ret; const u64 len = olen_aligned; u64 last_dest_end = destoff; + u64 prev_extent_end = off; ret = -ENOMEM; buf = kvmalloc(fs_info->nodesize, GFP_KERNEL); @@ -363,7 +364,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, key.offset = off; while (1) { - u64 next_key_min_offset = key.offset + 1; struct btrfs_file_extent_item *extent; u64 extent_gen; int type; @@ -431,14 +431,21 @@ process_slot: * The first search might have left us at an extent item that * ends before our target range's start, can happen if we have * holes and NO_HOLES feature enabled. + * + * Subsequent searches may leave us on a file range we have + * processed before - this happens due to a race with ordered + * extent completion for a file range that is outside our source + * range, but that range was part of a file extent item that + * also covered a leading part of our source range. */ - if (key.offset + datal <= off) { + if (key.offset + datal <= prev_extent_end) { path->slots[0]++; goto process_slot; } else if (key.offset >= off + len) { break; } - next_key_min_offset = key.offset + datal; + + prev_extent_end = key.offset + datal; size = btrfs_item_size(leaf, slot); read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, slot), size); @@ -489,6 +496,7 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; + clone_info.update_times = !no_time_update; ret = btrfs_replace_file_extents(BTRFS_I(inode), path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); @@ -550,7 +558,7 @@ process_slot: break; btrfs_release_path(path); - key.offset = next_key_min_offset; + key.offset = prev_extent_end; if (fatal_signal_pending(current)) { ret = -EINTR; @@ -614,14 +622,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, struct inode *inode2, u64 loff2, u64 len) { + u64 range1_end = loff1 + len - 1; + u64 range2_end = loff2 + len - 1; + if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); + swap(range1_end, range2_end); } else if (inode1 == inode2 && loff2 < loff1) { swap(loff1, loff2); + swap(range1_end, range2_end); } - lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); + + lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end); + lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end); + + btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end); + btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end); } static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) @@ -641,7 +658,8 @@ static void btrfs_double_mmap_unlock(struct 
inode *inode1, struct inode *inode2) static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { - const u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; + struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + const u64 bs = fs_info->sb->s_blocksize; int ret; /* @@ -652,6 +670,8 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -761,6 +781,8 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, round_down(destoff, PAGE_SIZE), round_up(destoff + len, PAGE_SIZE) - 1); + btrfs_btree_balance_dirty(fs_info); + return ret; } @@ -771,7 +793,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct inode *inode_in = file_inode(file_in); struct inode *inode_out = file_inode(file_out); u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize; - bool same_inode = inode_out == inode_in; u64 wb_len; int ret; @@ -810,15 +831,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, wb_len = ALIGN(*len, bs); /* - * Since we don't lock ranges, wait for ongoing lockless dio writes (as - * any in progress could create its ordered extents after we wait for - * existing ordered extents below). - */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - /* * Workaround to make sure NOCOW buffered write reach disk as NOCOW. * * Btrfs' back references do not have a block level granularity, they diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index fdc2c4b411f0..a6dc827e75af 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr) rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr); if (rb_node) { node = rb_entry(rb_node, struct mapping_node, rb_node); - root = (struct btrfs_root *)node->data; + root = node->data; } spin_unlock(&rc->reloc_root_tree.lock); return btrfs_grab_root(root); @@ -1101,7 +1101,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans, continue; /* - * if we are modifying block in fs tree, wait for readpage + * if we are modifying block in fs tree, wait for read_folio * to complete and drop the extent cache */ if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { @@ -1563,7 +1563,7 @@ static int invalidate_extent_cache(struct btrfs_root *root, end = (u64)-1; } - /* the lock_extent waits for readpage to complete */ + /* the lock_extent waits for read_folio to complete */ lock_extent(&BTRFS_I(inode)->io_tree, start, end); btrfs_drop_extent_cache(BTRFS_I(inode), start, end, 1); unlock_extent(&BTRFS_I(inode)->io_tree, start, end); @@ -2818,7 +2818,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * Subpage can't handle page with DIRTY but without UPTODATE * bit as it can lead to the following deadlock: * - * btrfs_readpage() + * btrfs_read_folio() * | Page already *locked* * |- btrfs_lock_and_flush_ordered_range() * |- btrfs_start_ordered_extent() @@ -2967,11 +2967,12 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, goto release_page; if (PageReadahead(page)) - page_cache_async_readahead(inode->i_mapping, ra, NULL, page, - page_index, last_index + 1 - page_index); + page_cache_async_readahead(inode->i_mapping, ra, NULL, + 
page_folio(page), page_index, + last_index + 1 - page_index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { ret = -EIO; @@ -2997,7 +2998,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, /* Reserve metadata for this range */ ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - clamped_len, clamped_len); + clamped_len, clamped_len, + false); if (ret) goto release_page; @@ -3845,8 +3847,7 @@ out: btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (err) { - if (inode) - iput(inode); + iput(inode); inode = ERR_PTR(err); } return inode; @@ -3977,6 +3978,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) if (!bg) return -ENOENT; + /* + * Relocation of a data block group creates ordered extents. Without + * sb_start_write(), we can freeze the filesystem while unfinished + * ordered extents are left. Such ordered extents can cause a deadlock + * e.g. when syncfs() is waiting for their completion but they can't + * finish because they block when joining a transaction, due to the + * fact that the freeze locks are being held in write mode. + */ + if (bg->flags & BTRFS_BLOCK_GROUP_DATA) + ASSERT(sb_write_started(fs_info->sb)); + if (btrfs_pinned_by_swapfile(fs_info, bg)) { btrfs_put_block_group(bg); return -ETXTBSY; diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index ca7426ef61c8..a64b26b16904 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, /* One for parent inode, two for dir entries */ qgroup_num_bytes = 3 * fs_info->nodesize; ret = btrfs_qgroup_reserve_meta_prealloc(root, - qgroup_num_bytes, true); + qgroup_num_bytes, true, + false); if (ret) return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 8cd713d37ad2..3afe5fa50a63 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -45,14 +45,14 @@ struct scrub_ctx; * operations. The first one configures an upper limit for the number * of (dynamically allocated) pages that are added to a bio. */ -#define SCRUB_PAGES_PER_BIO 32 /* 128KiB per bio for x86 */ -#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for x86 */ +#define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */ +#define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */ /* * The following value times PAGE_SIZE needs to be large enough to match the * largest node/leaf/sector size that shall be supported. 
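/*
 * Illustrative arithmetic (not from the sources): what the renamed
 * SCRUB_MAX_SECTORS_PER_BLOCK above works out to, assuming the usual
 * 64KiB maximum metadata block size and a 4KiB minimum sector size.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int max_metadata_blocksize = 64 * 1024;	/* assumed */
	const unsigned int min_sectorsize = 4 * 1024;		/* assumed */

	/* One scrub_block must be able to cover a whole tree block. */
	printf("SCRUB_MAX_SECTORS_PER_BLOCK = %u\n",
	       max_metadata_blocksize / min_sectorsize);	/* prints 16 */
	return 0;
}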
*/ -#define SCRUB_MAX_PAGES_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) +#define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K) struct scrub_recover { refcount_t refs; @@ -60,7 +60,7 @@ struct scrub_recover { u64 map_length; }; -struct scrub_page { +struct scrub_sector { struct scrub_block *sblock; struct page *page; struct btrfs_device *dev; @@ -87,16 +87,16 @@ struct scrub_bio { blk_status_t status; u64 logical; u64 physical; - struct scrub_page *pagev[SCRUB_PAGES_PER_BIO]; - int page_count; + struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO]; + int sector_count; int next_free; - struct btrfs_work work; + struct work_struct work; }; struct scrub_block { - struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK]; - int page_count; - atomic_t outstanding_pages; + struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK]; + int sector_count; + atomic_t outstanding_sectors; refcount_t refs; /* free mem on transition to zero */ struct scrub_ctx *sctx; struct scrub_parity *sparity; @@ -110,7 +110,7 @@ struct scrub_block { /* It is for the data with checksum */ unsigned int data_corrected:1; }; - struct btrfs_work work; + struct work_struct work; }; /* Used for the chunks with parity stripe such RAID5/6 */ @@ -129,21 +129,19 @@ struct scrub_parity { refcount_t refs; - struct list_head spages; + struct list_head sectors_list; /* Work of parity check and repair */ - struct btrfs_work work; + struct work_struct work; /* Mark the parity blocks which have data */ - unsigned long *dbitmap; + unsigned long dbitmap; /* * Mark the parity blocks which have data, but errors happen when * read data or check data */ - unsigned long *ebitmap; - - unsigned long bitmap[]; + unsigned long ebitmap; }; struct scrub_ctx { @@ -158,7 +156,7 @@ struct scrub_ctx { struct list_head csum_list; atomic_t cancel_req; int readonly; - int pages_per_bio; + int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -212,43 +210,43 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_checksum(struct scrub_block *sblock); static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good); -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good, - int page_num, int force_write); + int sector_num, int force_write); static void scrub_write_block_to_dev_replace(struct scrub_block *sblock); -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num); +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, + int sector_num); static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); -static void scrub_page_get(struct scrub_page *spage); -static void scrub_page_put(struct scrub_page *spage); +static void scrub_sector_get(struct scrub_sector *sector); +static void scrub_sector_put(struct scrub_sector *sector); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, - u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u8 *csum, - u64 
physical_for_dev_replace); +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, + u64 physical, struct btrfs_device *dev, u64 flags, + u64 gen, int mirror_num, u8 *csum, + u64 physical_for_dev_replace); static void scrub_bio_end_io(struct bio *bio); -static void scrub_bio_end_io_worker(struct btrfs_work *work); +static void scrub_bio_end_io_worker(struct work_struct *work); static void scrub_block_complete(struct scrub_block *sblock); -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num); -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num); +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector); static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); +static void scrub_wr_bio_end_io_worker(struct work_struct *work); static void scrub_put_ctx(struct scrub_ctx *sctx); -static inline int scrub_is_page_on_raid56(struct scrub_page *spage) +static inline int scrub_is_page_on_raid56(struct scrub_sector *sector) { - return spage->recover && - (spage->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); + return sector->recover && + (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK); } static void scrub_pending_bio_inc(struct scrub_ctx *sctx) @@ -535,9 +533,9 @@ static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx) if (sctx->curr != -1) { struct scrub_bio *sbio = sctx->bios[sctx->curr]; - for (i = 0; i < sbio->page_count; i++) { - WARN_ON(!sbio->pagev[i]->page); - scrub_block_put(sbio->pagev[i]->sblock); + for (i = 0; i < sbio->sector_count; i++) { + WARN_ON(!sbio->sectors[i]->page); + scrub_block_put(sbio->sectors[i]->sblock); } bio_put(sbio->bio); } @@ -572,7 +570,7 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( goto nomem; refcount_set(&sctx->refs, 1); sctx->is_dev_replace = is_dev_replace; - sctx->pages_per_bio = SCRUB_PAGES_PER_BIO; + sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO; sctx->curr = -1; sctx->fs_info = fs_info; INIT_LIST_HEAD(&sctx->csum_list); @@ -586,9 +584,8 @@ static noinline_for_stack struct scrub_ctx *scrub_setup_ctx( sbio->index = i; sbio->sctx = sctx; - sbio->page_count = 0; - btrfs_init_work(&sbio->work, scrub_bio_end_io_worker, NULL, - NULL); + sbio->sector_count = 0; + INIT_WORK(&sbio->work, scrub_bio_end_io_worker); if (i != SCRUB_BIOS_PER_SCTX - 1) sctx->bios[i]->next_free = i + 1; @@ -728,16 +725,16 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u8 ref_level = 0; int ret; - WARN_ON(sblock->page_count < 1); - dev = sblock->pagev[0]->dev; + WARN_ON(sblock->sector_count < 1); + dev = sblock->sectors[0]->dev; fs_info = sblock->sctx->fs_info; path = btrfs_alloc_path(); if (!path) return; - swarn.physical = sblock->pagev[0]->physical; - swarn.logical = sblock->pagev[0]->logical; + swarn.physical = sblock->sectors[0]->physical; + swarn.logical = sblock->sectors[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; @@ -798,8 +795,8 @@ static inline void scrub_put_recover(struct btrfs_fs_info *fs_info, /* * scrub_handle_errored_block gets called when either verification 
of the - * pages failed or the bio failed to read, e.g. with EIO. In the latter - * case, this function handles all pages in the bio, even though only one + * sectors failed or the bio failed to read, e.g. with EIO. In the latter + * case, this function handles all sectors in the bio, even though only one * may be bad. * The goal of this function is to repair the errored block by using the * contents of one of the mirrors. @@ -817,16 +814,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) struct scrub_block *sblock_bad; int ret; int mirror_index; - int page_num; + int sector_num; int success; bool full_stripe_locked; unsigned int nofs_flag; static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - BUG_ON(sblock_to_check->page_count < 1); + BUG_ON(sblock_to_check->sector_count < 1); fs_info = sctx->fs_info; - if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { + if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) { /* * if we find an error in a super block, we just report it. * They will get written with the next transaction commit @@ -837,13 +834,13 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_unlock(&sctx->stat_lock); return 0; } - logical = sblock_to_check->pagev[0]->logical; - BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1); - failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1; - is_metadata = !(sblock_to_check->pagev[0]->flags & + logical = sblock_to_check->sectors[0]->logical; + BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1); + failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1; + is_metadata = !(sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA); - have_csum = sblock_to_check->pagev[0]->have_csum; - dev = sblock_to_check->pagev[0]->dev; + have_csum = sblock_to_check->sectors[0]->have_csum; + dev = sblock_to_check->sectors[0]->dev; if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical)) return 0; @@ -854,7 +851,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * might be waiting the scrub task to pause (which needs to wait for all * the worker tasks to complete before pausing). * We do allocations in the workers through insert_full_stripe_lock() - * and scrub_add_page_to_wr_bio(), which happens down the call chain of + * and scrub_add_sector_to_wr_bio(), which happens down the call chain of * this function. */ nofs_flag = memalloc_nofs_save(); @@ -918,7 +915,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) goto out; } - /* setup the context, map the logical blocks and alloc the pages */ + /* Setup the context, map the logical blocks and alloc the sectors */ ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck); if (ret) { spin_lock(&sctx->stat_lock); @@ -937,7 +934,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (!sblock_bad->header_error && !sblock_bad->checksum_error && sblock_bad->no_io_error_seen) { /* - * the error disappeared after reading page by page, or + * The error disappeared after reading sector by sector, or * the area was part of a huge bio and other parts of the * bio caused I/O errors, or the block layer merged several * read requests into one and the error is caused by a @@ -998,10 +995,10 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * that is known to contain an error is rewritten. Afterwards * the block is known to be corrected. 
* If a mirror is found which is completely correct, and no - * checksum is present, only those pages are rewritten that had + * checksum is present, only those sectors are rewritten that had * an I/O error in the block to be repaired, since it cannot be - * determined, which copy of the other pages is better (and it - * could happen otherwise that a correct page would be + * determined, which copy of the other sectors is better (and it + * could happen otherwise that a correct sector would be * overwritten by a bad one). */ for (mirror_index = 0; ;mirror_index++) { @@ -1011,25 +1008,25 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) continue; /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */ - if (!scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) { if (mirror_index >= BTRFS_MAX_MIRRORS) break; - if (!sblocks_for_recheck[mirror_index].page_count) + if (!sblocks_for_recheck[mirror_index].sector_count) break; sblock_other = sblocks_for_recheck + mirror_index; } else { - struct scrub_recover *r = sblock_bad->pagev[0]->recover; + struct scrub_recover *r = sblock_bad->sectors[0]->recover; int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs; if (mirror_index >= max_allowed) break; - if (!sblocks_for_recheck[1].page_count) + if (!sblocks_for_recheck[1].sector_count) break; ASSERT(failed_mirror_index == 0); sblock_other = sblocks_for_recheck + 1; - sblock_other->pagev[0]->mirror_num = 1 + mirror_index; + sblock_other->sectors[0]->mirror_num = 1 + mirror_index; } /* build and submit the bios, check checksums */ @@ -1078,16 +1075,16 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * area are unreadable. */ success = 1; - for (page_num = 0; page_num < sblock_bad->page_count; - page_num++) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; + for (sector_num = 0; sector_num < sblock_bad->sector_count; + sector_num++) { + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; struct scrub_block *sblock_other = NULL; - /* skip no-io-error page in scrub */ - if (!spage_bad->io_error && !sctx->is_dev_replace) + /* Skip no-io-error sectors in scrub */ + if (!sector_bad->io_error && !sctx->is_dev_replace) continue; - if (scrub_is_page_on_raid56(sblock_bad->pagev[0])) { + if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) { /* * In case of dev replace, if raid56 rebuild process * didn't work out correct data, then copy the content @@ -1096,14 +1093,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) * sblock_for_recheck array to target device. */ sblock_other = NULL; - } else if (spage_bad->io_error) { - /* try to find no-io-error page in mirrors */ + } else if (sector_bad->io_error) { + /* Try to find no-io-error sector in mirrors */ for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS && - sblocks_for_recheck[mirror_index].page_count > 0; + sblocks_for_recheck[mirror_index].sector_count > 0; mirror_index++) { if (!sblocks_for_recheck[mirror_index]. - pagev[page_num]->io_error) { + sectors[sector_num]->io_error) { sblock_other = sblocks_for_recheck + mirror_index; break; @@ -1115,27 +1112,26 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) if (sctx->is_dev_replace) { /* - * did not find a mirror to fetch the page - * from. 
scrub_write_page_to_dev_replace() - * handles this case (page->io_error), by - * filling the block with zeros before - * submitting the write request + * Did not find a mirror to fetch the sector from. + * scrub_write_sector_to_dev_replace() handles this + * case (sector->io_error), by filling the block with + * zeros before submitting the write request */ if (!sblock_other) sblock_other = sblock_bad; - if (scrub_write_page_to_dev_replace(sblock_other, - page_num) != 0) { + if (scrub_write_sector_to_dev_replace(sblock_other, + sector_num) != 0) { atomic64_inc( &fs_info->dev_replace.num_write_errors); success = 0; } } else if (sblock_other) { - ret = scrub_repair_page_from_good_copy(sblock_bad, - sblock_other, - page_num, 0); + ret = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_other, + sector_num, 0); if (0 == ret) - spage_bad->io_error = 0; + sector_bad->io_error = 0; else success = 0; } @@ -1186,18 +1182,16 @@ out: struct scrub_block *sblock = sblocks_for_recheck + mirror_index; struct scrub_recover *recover; - int page_index; + int i; - for (page_index = 0; page_index < sblock->page_count; - page_index++) { - sblock->pagev[page_index]->sblock = NULL; - recover = sblock->pagev[page_index]->recover; + for (i = 0; i < sblock->sector_count; i++) { + sblock->sectors[i]->sblock = NULL; + recover = sblock->sectors[i]->recover; if (recover) { scrub_put_recover(fs_info, recover); - sblock->pagev[page_index]->recover = - NULL; + sblock->sectors[i]->recover = NULL; } - scrub_page_put(sblock->pagev[page_index]); + scrub_sector_put(sblock->sectors[i]); } } kfree(sblocks_for_recheck); @@ -1222,7 +1216,6 @@ static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc) static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, u64 *raid_map, - u64 mapped_length, int nstripes, int mirror, int *stripe_index, u64 *stripe_offset) @@ -1237,7 +1230,7 @@ static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type, continue; if (logical >= raid_map[i] && - logical < raid_map[i] + mapped_length) + logical < raid_map[i] + BTRFS_STRIPE_LEN) break; } @@ -1255,26 +1248,25 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, { struct scrub_ctx *sctx = original_sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = original_sblock->page_count * fs_info->sectorsize; - u64 logical = original_sblock->pagev[0]->logical; - u64 generation = original_sblock->pagev[0]->generation; - u64 flags = original_sblock->pagev[0]->flags; - u64 have_csum = original_sblock->pagev[0]->have_csum; + u64 length = original_sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = original_sblock->sectors[0]->logical; + u64 generation = original_sblock->sectors[0]->generation; + u64 flags = original_sblock->sectors[0]->flags; + u64 have_csum = original_sblock->sectors[0]->have_csum; struct scrub_recover *recover; struct btrfs_io_context *bioc; u64 sublen; u64 mapped_length; u64 stripe_offset; int stripe_index; - int page_index = 0; + int sector_index = 0; int mirror_index; int nmirrors; int ret; /* - * note: the two members refs and outstanding_pages - * are not used (and not set) in the blocks that are used for - * the recheck procedure + * Note: the two members refs and outstanding_sectors are not used (and + * not set) in the blocks that are used for the recheck procedure. 
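/*
 * Illustrative stand-alone sketch (not from the sources): the containment
 * lookup performed by scrub_stripe_index_and_offset() above, now bounded
 * by BTRFS_STRIPE_LEN instead of the mapped length.  Assumes the usual
 * 64KiB stripe length and uses made-up raid_map addresses; the RAID6
 * mirror rotation handled by the real function is omitted.
 */
#include <stdio.h>

int main(void)
{
	const unsigned long long stripe_len = 64 * 1024;	/* assumed */
	/* Example data stripe start addresses of one full stripe. */
	const unsigned long long raid_map[] = { 1048576, 1114112, 1179648 };
	const unsigned long long logical = 1130496;	/* inside stripe 1 */

	for (int i = 0; i < 3; i++) {
		if (logical >= raid_map[i] &&
		    logical < raid_map[i] + stripe_len) {
			printf("stripe_index=%d stripe_offset=%llu\n",
			       i, logical - raid_map[i]);
			break;
		}
	}
	return 0;
}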
*/ while (length > 0) { @@ -1306,20 +1298,20 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock, recover->bioc = bioc; recover->map_length = mapped_length; - ASSERT(page_index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK); nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS); for (mirror_index = 0; mirror_index < nmirrors; mirror_index++) { struct scrub_block *sblock; - struct scrub_page *spage; + struct scrub_sector *sector; sblock = sblocks_for_recheck + mirror_index; sblock->sctx = sctx; - spage = kzalloc(sizeof(*spage), GFP_NOFS); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_NOFS); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -1327,49 +1319,48 @@ leave_nomem: scrub_put_recover(fs_info, recover); return -ENOMEM; } - scrub_page_get(spage); - sblock->pagev[page_index] = spage; - spage->sblock = sblock; - spage->flags = flags; - spage->generation = generation; - spage->logical = logical; - spage->have_csum = have_csum; + scrub_sector_get(sector); + sblock->sectors[sector_index] = sector; + sector->sblock = sblock; + sector->flags = flags; + sector->generation = generation; + sector->logical = logical; + sector->have_csum = have_csum; if (have_csum) - memcpy(spage->csum, - original_sblock->pagev[0]->csum, + memcpy(sector->csum, + original_sblock->sectors[0]->csum, sctx->fs_info->csum_size); scrub_stripe_index_and_offset(logical, bioc->map_type, bioc->raid_map, - mapped_length, bioc->num_stripes - bioc->num_tgtdevs, mirror_index, &stripe_index, &stripe_offset); - spage->physical = bioc->stripes[stripe_index].physical + + sector->physical = bioc->stripes[stripe_index].physical + stripe_offset; - spage->dev = bioc->stripes[stripe_index].dev; + sector->dev = bioc->stripes[stripe_index].dev; - BUG_ON(page_index >= original_sblock->page_count); - spage->physical_for_dev_replace = - original_sblock->pagev[page_index]-> + BUG_ON(sector_index >= original_sblock->sector_count); + sector->physical_for_dev_replace = + original_sblock->sectors[sector_index]-> physical_for_dev_replace; - /* for missing devices, dev->bdev is NULL */ - spage->mirror_num = mirror_index + 1; - sblock->page_count++; - spage->page = alloc_page(GFP_NOFS); - if (!spage->page) + /* For missing devices, dev->bdev is NULL */ + sector->mirror_num = mirror_index + 1; + sblock->sector_count++; + sector->page = alloc_page(GFP_NOFS); + if (!sector->page) goto leave_nomem; scrub_get_recover(recover); - spage->recover = recover; + sector->recover = recover; } scrub_put_recover(fs_info, recover); length -= sublen; logical += sublen; - page_index++; + sector_index++; } return 0; @@ -1382,22 +1373,15 @@ static void scrub_bio_wait_endio(struct bio *bio) static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, struct bio *bio, - struct scrub_page *spage) + struct scrub_sector *sector) { DECLARE_COMPLETION_ONSTACK(done); - int ret; - int mirror_num; - bio->bi_iter.bi_sector = spage->logical >> 9; + bio->bi_iter.bi_sector = sector->logical >> 9; bio->bi_private = &done; bio->bi_end_io = scrub_bio_wait_endio; - - mirror_num = spage->sblock->pagev[0]->mirror_num; - ret = raid56_parity_recover(bio, spage->recover->bioc, - spage->recover->map_length, - mirror_num, 0); - if (ret) - return ret; + raid56_parity_recover(bio, sector->recover->bioc, + sector->sblock->sectors[0]->mirror_num, false); wait_for_completion_io(&done); return blk_status_to_errno(bio->bi_status); @@ -1406,26 +1390,25 @@ static int 
scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info, static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, struct scrub_block *sblock) { - struct scrub_page *first_page = sblock->pagev[0]; + struct scrub_sector *first_sector = sblock->sectors[0]; struct bio *bio; - int page_num; + int i; - /* All pages in sblock belong to the same stripe on the same device. */ - ASSERT(first_page->dev); - if (!first_page->dev->bdev) + /* All sectors in sblock belong to the same stripe on the same device. */ + ASSERT(first_sector->dev); + if (!first_sector->dev->bdev) goto out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); - bio_set_dev(bio, first_page->dev->bdev); + bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - WARN_ON(!spage->page); - bio_add_page(bio, spage->page, PAGE_SIZE, 0); + WARN_ON(!sector->page); + bio_add_page(bio, sector->page, PAGE_SIZE, 0); } - if (scrub_submit_raid56_bio_wait(fs_info, bio, first_page)) { + if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) { bio_put(bio); goto out; } @@ -1436,65 +1419,63 @@ static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info, return; out: - for (page_num = 0; page_num < sblock->page_count; page_num++) - sblock->pagev[page_num]->io_error = 1; + for (i = 0; i < sblock->sector_count; i++) + sblock->sectors[i]->io_error = 1; sblock->no_io_error_seen = 0; } /* - * this function will check the on disk data for checksum errors, header - * errors and read I/O errors. If any I/O errors happen, the exact pages - * which are errored are marked as being bad. The goal is to enable scrub - * to take those pages that are not errored from all the mirrors so that - * the pages that are errored in the just handled mirror can be repaired. + * This function will check the on disk data for checksum errors, header errors + * and read I/O errors. If any I/O errors happen, the exact sectors which are + * errored are marked as being bad. The goal is to enable scrub to take those + * sectors that are not errored from all the mirrors so that the sectors that + * are errored in the just handled mirror can be repaired. 
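/*
 * Illustrative stand-alone sketch (not from the sources): the per-sector
 * error bookkeeping described in the comment above - a failed read marks
 * only that sector as bad, while any failure clears the block-wide
 * no_io_error_seen flag.  read_ok[] stands in for the outcome of the
 * synchronous per-sector reads.
 */
#include <stdio.h>

int main(void)
{
	int io_error[4] = { 0 };
	const int read_ok[4] = { 1, 1, 0, 1 };	/* pretend sector 2 failed */
	int no_io_error_seen = 1;

	for (int i = 0; i < 4; i++) {
		if (!read_ok[i]) {
			io_error[i] = 1;
			no_io_error_seen = 0;
		}
	}
	printf("no_io_error_seen=%d, sector 2 io_error=%d\n",
	       no_io_error_seen, io_error[2]);
	return 0;
}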
*/ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int retry_failed_mirror) { - int page_num; + int i; sblock->no_io_error_seen = 1; /* short cut for raid56 */ - if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->pagev[0])) + if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0])) return scrub_recheck_block_on_raid56(fs_info, sblock); - for (page_num = 0; page_num < sblock->page_count; page_num++) { - struct bio *bio; - struct scrub_page *spage = sblock->pagev[page_num]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; + struct bio bio; + struct bio_vec bvec; - if (spage->dev->bdev == NULL) { - spage->io_error = 1; + if (sector->dev->bdev == NULL) { + sector->io_error = 1; sblock->no_io_error_seen = 0; continue; } - WARN_ON(!spage->page); - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage->dev->bdev); - - bio_add_page(bio, spage->page, fs_info->sectorsize, 0); - bio->bi_iter.bi_sector = spage->physical >> 9; - bio->bi_opf = REQ_OP_READ; + WARN_ON(!sector->page); + bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ); + bio_add_page(&bio, sector->page, fs_info->sectorsize, 0); + bio.bi_iter.bi_sector = sector->physical >> 9; - if (btrfsic_submit_bio_wait(bio)) { - spage->io_error = 1; + btrfsic_check_bio(&bio); + if (submit_bio_wait(&bio)) { + sector->io_error = 1; sblock->no_io_error_seen = 0; } - bio_put(bio); + bio_uninit(&bio); } if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); } -static inline int scrub_check_fsid(u8 fsid[], - struct scrub_page *spage) +static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector) { - struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices; int ret; ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -1507,7 +1488,7 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) sblock->checksum_error = 0; sblock->generation_error = 0; - if (sblock->pagev[0]->flags & BTRFS_EXTENT_FLAG_DATA) + if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA) scrub_checksum_data(sblock); else scrub_checksum_tree_block(sblock); @@ -1516,15 +1497,14 @@ static void scrub_recheck_block_checksum(struct scrub_block *sblock) static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, struct scrub_block *sblock_good) { - int page_num; + int i; int ret = 0; - for (page_num = 0; page_num < sblock_bad->page_count; page_num++) { + for (i = 0; i < sblock_bad->sector_count; i++) { int ret_sub; - ret_sub = scrub_repair_page_from_good_copy(sblock_bad, - sblock_good, - page_num, 1); + ret_sub = scrub_repair_sector_from_good_copy(sblock_bad, + sblock_good, i, 1); if (ret_sub) ret = ret_sub; } @@ -1532,47 +1512,43 @@ static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad, return ret; } -static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, - struct scrub_block *sblock_good, - int page_num, int force_write) +static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad, + struct scrub_block *sblock_good, + int sector_num, int force_write) { - struct scrub_page *spage_bad = sblock_bad->pagev[page_num]; - struct scrub_page *spage_good = sblock_good->pagev[page_num]; + struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num]; + struct scrub_sector *sector_good = sblock_good->sectors[sector_num]; struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info; const u32 
sectorsize = fs_info->sectorsize; - BUG_ON(spage_bad->page == NULL); - BUG_ON(spage_good->page == NULL); + BUG_ON(sector_bad->page == NULL); + BUG_ON(sector_good->page == NULL); if (force_write || sblock_bad->header_error || - sblock_bad->checksum_error || spage_bad->io_error) { - struct bio *bio; + sblock_bad->checksum_error || sector_bad->io_error) { + struct bio bio; + struct bio_vec bvec; int ret; - if (!spage_bad->dev->bdev) { + if (!sector_bad->dev->bdev) { btrfs_warn_rl(fs_info, "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected"); return -EIO; } - bio = btrfs_bio_alloc(1); - bio_set_dev(bio, spage_bad->dev->bdev); - bio->bi_iter.bi_sector = spage_bad->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE); + bio.bi_iter.bi_sector = sector_bad->physical >> 9; + __bio_add_page(&bio, sector_good->page, sectorsize, 0); - ret = bio_add_page(bio, spage_good->page, sectorsize, 0); - if (ret != sectorsize) { - bio_put(bio); - return -EIO; - } + btrfsic_check_bio(&bio); + ret = submit_bio_wait(&bio); + bio_uninit(&bio); - if (btrfsic_submit_bio_wait(bio)) { - btrfs_dev_stat_inc_and_print(spage_bad->dev, + if (ret) { + btrfs_dev_stat_inc_and_print(sector_bad->dev, BTRFS_DEV_STAT_WRITE_ERRS); atomic64_inc(&fs_info->dev_replace.num_write_errors); - bio_put(bio); return -EIO; } - bio_put(bio); } return 0; @@ -1581,7 +1557,7 @@ static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad, static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) { struct btrfs_fs_info *fs_info = sblock->sctx->fs_info; - int page_num; + int i; /* * This block is used for the check of the parity on the source device, @@ -1590,25 +1566,24 @@ static void scrub_write_block_to_dev_replace(struct scrub_block *sblock) if (sblock->sparity) return; - for (page_num = 0; page_num < sblock->page_count; page_num++) { + for (i = 0; i < sblock->sector_count; i++) { int ret; - ret = scrub_write_page_to_dev_replace(sblock, page_num); + ret = scrub_write_sector_to_dev_replace(sblock, i); if (ret) atomic64_inc(&fs_info->dev_replace.num_write_errors); } } -static int scrub_write_page_to_dev_replace(struct scrub_block *sblock, - int page_num) +static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num) { - struct scrub_page *spage = sblock->pagev[page_num]; + struct scrub_sector *sector = sblock->sectors[sector_num]; - BUG_ON(spage->page == NULL); - if (spage->io_error) - clear_page(page_address(spage->page)); + BUG_ON(sector->page == NULL); + if (sector->io_error) + clear_page(page_address(sector->page)); - return scrub_add_page_to_wr_bio(sblock->sctx, spage); + return scrub_add_sector_to_wr_bio(sblock->sctx, sector); } static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) @@ -1633,8 +1608,8 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { struct scrub_bio *sbio; int ret; @@ -1650,45 +1625,38 @@ again: return -ENOMEM; } sctx->wr_curr_bio->sctx = sctx; - sctx->wr_curr_bio->page_count = 0; + sctx->wr_curr_bio->sector_count = 0; } sbio = sctx->wr_curr_bio; - if (sbio->page_count == 0) { - struct bio *bio; - - ret = fill_writer_pointer_gap(sctx, - spage->physical_for_dev_replace); + if (sbio->sector_count == 0) { + ret = fill_writer_pointer_gap(sctx, 
sector->physical_for_dev_replace); if (ret) { mutex_unlock(&sctx->wr_lock); return ret; } - sbio->physical = spage->physical_for_dev_replace; - sbio->logical = spage->logical; + sbio->physical = sector->physical_for_dev_replace; + sbio->logical = sector->logical; sbio->dev = sctx->wr_tgtdev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_WRITE, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_wr_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_WRITE; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_wr_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical_for_dev_replace || - sbio->logical + sbio->page_count * sectorsize != - spage->logical) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sector->physical_for_dev_replace || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical) { scrub_wr_submit(sctx); goto again; } - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; mutex_unlock(&sctx->wr_lock); @@ -1698,10 +1666,10 @@ again: goto again; } - sbio->pagev[sbio->page_count] = spage; - scrub_page_get(spage); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + sbio->sectors[sbio->sector_count] = sector; + scrub_sector_get(sector); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_wr_submit(sctx); mutex_unlock(&sctx->wr_lock); @@ -1717,16 +1685,16 @@ static void scrub_wr_submit(struct scrub_ctx *sctx) sbio = sctx->wr_curr_bio; sctx->wr_curr_bio = NULL; - WARN_ON(!sbio->bio->bi_bdev); scrub_pending_bio_inc(sctx); /* process all writes in a single worker thread. 
Then the block layer * orders the requests before sending them to the driver which * doubled the write performance on spinning disks when measured * with Linux 3.5 */ - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); if (btrfs_is_zoned(sctx->fs_info)) - sctx->write_pointer = sbio->physical + sbio->page_count * + sctx->write_pointer = sbio->physical + sbio->sector_count * sctx->fs_info->sectorsize; } @@ -1738,31 +1706,31 @@ static void scrub_wr_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL); - btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); + INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker); + queue_work(fs_info->scrub_wr_completion_workers, &sbio->work); } -static void scrub_wr_bio_end_io_worker(struct btrfs_work *work) +static void scrub_wr_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { struct btrfs_dev_replace *dev_replace = &sbio->sctx->fs_info->dev_replace; - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; + sector->io_error = 1; atomic64_inc(&dev_replace->num_write_errors); } } - for (i = 0; i < sbio->page_count; i++) - scrub_page_put(sbio->pagev[i]); + for (i = 0; i < sbio->sector_count; i++) + scrub_sector_put(sbio->sectors[i]); bio_put(sbio->bio); kfree(sbio); @@ -1786,8 +1754,8 @@ static int scrub_checksum(struct scrub_block *sblock) sblock->generation_error = 0; sblock->checksum_error = 0; - WARN_ON(sblock->page_count < 1); - flags = sblock->pagev[0]->flags; + WARN_ON(sblock->sector_count < 1); + flags = sblock->sectors[0]->flags; ret = 0; if (flags & BTRFS_EXTENT_FLAG_DATA) ret = scrub_checksum_data(sblock); @@ -1809,26 +1777,26 @@ static int scrub_checksum_data(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - if (!spage->have_csum) + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + if (!sector->have_csum) return 0; - kaddr = page_address(spage->page); + kaddr = page_address(sector->page); shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); /* - * In scrub_pages() and scrub_pages_for_parity() we ensure each spage + * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector * only contains one sector of data. 
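
The write-completion path above now uses the stock kernel workqueue API directly: the end_io callback records the bio status, initializes the embedded work item with INIT_WORK() and queues it with queue_work(), and the worker recovers its scrub_bio with container_of(). A minimal, illustrative sketch of that idiom follows; the names (my_sbio, my_end_io, my_worker, my_scrub_wq) are invented for the example, and the workqueue is assumed to have been created elsewhere with alloc_workqueue().

#include <linux/bio.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

static struct workqueue_struct *my_scrub_wq;    /* assumed: created via alloc_workqueue() */

struct my_sbio {
        struct bio *bio;
        blk_status_t status;
        struct work_struct work;        /* queued from the bio completion */
};

static void my_worker(struct work_struct *work)
{
        /* Recover the enclosing context from the embedded work item. */
        struct my_sbio *sbio = container_of(work, struct my_sbio, work);

        /* ... handle the completed I/O in process context ... */
        bio_put(sbio->bio);
        kfree(sbio);
}

/* May run in IRQ context, so only record state and defer to the workqueue. */
static void my_end_io(struct bio *bio)
{
        struct my_sbio *sbio = bio->bi_private;

        sbio->status = bio->bi_status;
        INIT_WORK(&sbio->work, my_worker);
        queue_work(my_scrub_wq, &sbio->work);
}
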
*/ crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - if (memcmp(csum, spage->csum, fs_info->csum_size)) + if (memcmp(csum, sector->csum, fs_info->csum_size)) sblock->checksum_error = 1; return sblock->checksum_error; } @@ -1849,16 +1817,16 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) const u32 sectorsize = sctx->fs_info->sectorsize; const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits; int i; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; - BUG_ON(sblock->page_count < 1); + BUG_ON(sblock->sector_count < 1); - /* Each member in pagev is just one block, not a full page */ - ASSERT(sblock->page_count == num_sectors); + /* Each member in sectors is just one sector */ + ASSERT(sblock->sector_count == num_sectors); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); h = (struct btrfs_header *)kaddr; memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size); @@ -1867,15 +1835,15 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) * a) don't have an extent buffer and * b) the page is already kmapped */ - if (spage->logical != btrfs_stack_header_bytenr(h)) + if (sector->logical != btrfs_stack_header_bytenr(h)) sblock->header_error = 1; - if (spage->generation != btrfs_stack_header_generation(h)) { + if (sector->generation != btrfs_stack_header_generation(h)) { sblock->header_error = 1; sblock->generation_error = 1; } - if (!scrub_check_fsid(h->fsid, spage)) + if (!scrub_check_fsid(h->fsid, sector)) sblock->header_error = 1; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, @@ -1888,7 +1856,7 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) sectorsize - BTRFS_CSUM_SIZE); for (i = 1; i < num_sectors; i++) { - kaddr = page_address(sblock->pagev[i]->page); + kaddr = page_address(sblock->sectors[i]->page); crypto_shash_update(shash, kaddr, sectorsize); } @@ -1906,23 +1874,23 @@ static int scrub_checksum_super(struct scrub_block *sblock) struct btrfs_fs_info *fs_info = sctx->fs_info; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct scrub_page *spage; + struct scrub_sector *sector; char *kaddr; int fail_gen = 0; int fail_cor = 0; - BUG_ON(sblock->page_count < 1); - spage = sblock->pagev[0]; - kaddr = page_address(spage->page); + BUG_ON(sblock->sector_count < 1); + sector = sblock->sectors[0]; + kaddr = page_address(sector->page); s = (struct btrfs_super_block *)kaddr; - if (spage->logical != btrfs_super_bytenr(s)) + if (sector->logical != btrfs_super_bytenr(s)) ++fail_cor; - if (spage->generation != btrfs_super_generation(s)) + if (sector->generation != btrfs_super_generation(s)) ++fail_gen; - if (!scrub_check_fsid(s->fsid, spage)) + if (!scrub_check_fsid(s->fsid, sector)) ++fail_cor; shash->tfm = fs_info->csum_shash; @@ -1943,10 +1911,10 @@ static int scrub_checksum_super(struct scrub_block *sblock) ++sctx->stat.super_errors; spin_unlock(&sctx->stat_lock); if (fail_cor) - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); else - btrfs_dev_stat_inc_and_print(spage->dev, + btrfs_dev_stat_inc_and_print(sector->dev, BTRFS_DEV_STAT_GENERATION_ERRS); } @@ -1966,23 +1934,23 @@ static void scrub_block_put(struct scrub_block *sblock) if (sblock->sparity) scrub_parity_put(sblock->sparity); - for (i = 0; i < sblock->page_count; i++) - scrub_page_put(sblock->pagev[i]); + for (i = 0; i < sblock->sector_count; 
i++) + scrub_sector_put(sblock->sectors[i]); kfree(sblock); } } -static void scrub_page_get(struct scrub_page *spage) +static void scrub_sector_get(struct scrub_sector *sector) { - atomic_inc(&spage->refs); + atomic_inc(&sector->refs); } -static void scrub_page_put(struct scrub_page *spage) +static void scrub_sector_put(struct scrub_sector *sector) { - if (atomic_dec_and_test(&spage->refs)) { - if (spage->page) - __free_page(spage->page); - kfree(spage); + if (atomic_dec_and_test(&sector->refs)) { + if (sector->page) + __free_page(sector->page); + kfree(sector); } } @@ -2057,13 +2025,14 @@ static void scrub_submit(struct scrub_ctx *sctx) sbio = sctx->bios[sctx->curr]; sctx->curr = -1; scrub_pending_bio_inc(sctx); - btrfsic_submit_bio(sbio->bio); + btrfsic_check_bio(sbio->bio); + submit_bio(sbio->bio); } -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage) +static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx, + struct scrub_sector *sector) { - struct scrub_block *sblock = spage->sblock; + struct scrub_block *sblock = sector->sblock; struct scrub_bio *sbio; const u32 sectorsize = sctx->fs_info->sectorsize; int ret; @@ -2078,7 +2047,7 @@ again: if (sctx->curr != -1) { sctx->first_free = sctx->bios[sctx->curr]->next_free; sctx->bios[sctx->curr]->next_free = -1; - sctx->bios[sctx->curr]->page_count = 0; + sctx->bios[sctx->curr]->sector_count = 0; spin_unlock(&sctx->list_lock); } else { spin_unlock(&sctx->list_lock); @@ -2086,37 +2055,31 @@ again: } } sbio = sctx->bios[sctx->curr]; - if (sbio->page_count == 0) { - struct bio *bio; - - sbio->physical = spage->physical; - sbio->logical = spage->logical; - sbio->dev = spage->dev; - bio = sbio->bio; - if (!bio) { - bio = btrfs_bio_alloc(sctx->pages_per_bio); - sbio->bio = bio; + if (sbio->sector_count == 0) { + sbio->physical = sector->physical; + sbio->logical = sector->logical; + sbio->dev = sector->dev; + if (!sbio->bio) { + sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio, + REQ_OP_READ, GFP_NOFS); } - - bio->bi_private = sbio; - bio->bi_end_io = scrub_bio_end_io; - bio_set_dev(bio, sbio->dev->bdev); - bio->bi_iter.bi_sector = sbio->physical >> 9; - bio->bi_opf = REQ_OP_READ; + sbio->bio->bi_private = sbio; + sbio->bio->bi_end_io = scrub_bio_end_io; + sbio->bio->bi_iter.bi_sector = sbio->physical >> 9; sbio->status = 0; - } else if (sbio->physical + sbio->page_count * sectorsize != - spage->physical || - sbio->logical + sbio->page_count * sectorsize != - spage->logical || - sbio->dev != spage->dev) { + } else if (sbio->physical + sbio->sector_count * sectorsize != + sector->physical || + sbio->logical + sbio->sector_count * sectorsize != + sector->logical || + sbio->dev != sector->dev) { scrub_submit(sctx); goto again; } - sbio->pagev[sbio->page_count] = spage; - ret = bio_add_page(sbio->bio, spage->page, sectorsize, 0); + sbio->sectors[sbio->sector_count] = sector; + ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0); if (ret != sectorsize) { - if (sbio->page_count < 1) { + if (sbio->sector_count < 1) { bio_put(sbio->bio); sbio->bio = NULL; return -EIO; @@ -2126,9 +2089,9 @@ again: } scrub_block_get(sblock); /* one for the page added to the bio */ - atomic_inc(&sblock->outstanding_pages); - sbio->page_count++; - if (sbio->page_count == sctx->pages_per_bio) + atomic_inc(&sblock->outstanding_sectors); + sbio->sector_count++; + if (sbio->sector_count == sctx->sectors_per_bio) scrub_submit(sctx); return 0; @@ -2144,10 +2107,10 @@ static void scrub_missing_raid56_end_io(struct bio *bio)
bio_put(bio); - btrfs_queue_work(fs_info->scrub_workers, &sblock->work); + queue_work(fs_info->scrub_workers, &sblock->work); } -static void scrub_missing_raid56_worker(struct btrfs_work *work) +static void scrub_missing_raid56_worker(struct work_struct *work) { struct scrub_block *sblock = container_of(work, struct scrub_block, work); struct scrub_ctx *sctx = sblock->sctx; @@ -2155,8 +2118,8 @@ static void scrub_missing_raid56_worker(struct btrfs_work *work) u64 logical; struct btrfs_device *dev; - logical = sblock->pagev[0]->logical; - dev = sblock->pagev[0]->dev; + logical = sblock->sectors[0]->logical; + dev = sblock->sectors[0]->dev; if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); @@ -2193,8 +2156,8 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) { struct scrub_ctx *sctx = sblock->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; - u64 length = sblock->page_count * PAGE_SIZE; - u64 logical = sblock->pagev[0]->logical; + u64 length = sblock->sector_count << fs_info->sectorsize_bits; + u64 logical = sblock->sectors[0]->logical; struct btrfs_io_context *bioc = NULL; struct bio *bio; struct btrfs_raid_bio *rbio; @@ -2213,27 +2176,31 @@ static void scrub_missing_raid56_pages(struct scrub_block *sblock) * We shouldn't be scrubbing a missing device. Even for dev * replace, we should only get here for RAID 5/6. We either * managed to mount something with no mirrors remaining or - * there's a bug in scrub_remap_extent()/btrfs_map_block(). + * there's a bug in scrub_find_good_copy()/btrfs_map_block(). */ goto bioc_out; } - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = logical >> 9; bio->bi_private = sblock; bio->bi_end_io = scrub_missing_raid56_end_io; - rbio = raid56_alloc_missing_rbio(bio, bioc, length); + rbio = raid56_alloc_missing_rbio(bio, bioc); if (!rbio) goto rbio_out; - for (i = 0; i < sblock->page_count; i++) { - struct scrub_page *spage = sblock->pagev[i]; + for (i = 0; i < sblock->sector_count; i++) { + struct scrub_sector *sector = sblock->sectors[i]; - raid56_add_scrub_pages(rbio, spage->page, spage->logical); + /* + * For now, our scrub is still one page per sector, so pgoff + * is always 0. + */ + raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical); } - btrfs_init_work(&sblock->work, scrub_missing_raid56_worker, NULL, NULL); + INIT_WORK(&sblock->work, scrub_missing_raid56_worker); scrub_block_get(sblock); scrub_pending_bio_inc(sctx); raid56_submit_missing_rbio(rbio); @@ -2249,7 +2216,7 @@ bioc_out: spin_unlock(&sctx->stat_lock); } -static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, +static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, u64 physical_for_dev_replace) @@ -2273,7 +2240,7 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, sblock->no_io_error_seen = 1; for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; /* * Here we will allocate one page for one sector to scrub. 
* This is fine if PAGE_SIZE == sectorsize, but will cost @@ -2281,8 +2248,8 @@ static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, */ u32 l = min(sectorsize, len); - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2290,26 +2257,26 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); - scrub_page_get(spage); - sblock->pagev[index] = spage; - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->physical_for_dev_replace = physical_for_dev_replace; - spage->mirror_num = mirror_num; + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); + scrub_sector_get(sector); + sblock->sectors[index] = sector; + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->physical_for_dev_replace = physical_for_dev_replace; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; len -= l; logical += l; @@ -2317,7 +2284,7 @@ leave_nomem: physical_for_dev_replace += l; } - WARN_ON(sblock->page_count == 0); + WARN_ON(sblock->sector_count == 0); if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) { /* * This case should only be hit for RAID 5/6 device replace. 
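
As the comment above notes, every sector in a scrub_block is backed by its own page, so the per-block memory cost scales with the sector count rather than with the amount of data. A small standalone sketch of that arithmetic, using assumed constants (64K maximum tree block, 4K sector size, 4K pages); the figures are illustrative only, not taken from the patch:

#include <assert.h>
#include <stdio.h>

int main(void)
{
        const unsigned int max_tree_block = 64 * 1024;  /* assumed upper bound */
        const unsigned int sectorsize = 4 * 1024;       /* assumed 4K sectors */
        const unsigned int page_size = 4 * 1024;        /* one page per sector */

        /* Upper bound on sectors (and therefore pages) in one scrub_block. */
        unsigned int sectors_per_block = max_tree_block / sectorsize;
        unsigned int bytes_per_block = sectors_per_block * page_size;

        assert(sectors_per_block == 16);
        /* With 64K pages the same 16 sectors would pin 16 * 64K = 1M. */
        printf("%u sectors, %u bytes per scrub block\n",
               sectors_per_block, bytes_per_block);
        return 0;
}
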
See @@ -2325,11 +2292,11 @@ leave_nomem: */ scrub_missing_raid56_pages(sblock); } else { - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) { scrub_block_put(sblock); return ret; @@ -2353,31 +2320,31 @@ static void scrub_bio_end_io(struct bio *bio) sbio->status = bio->bi_status; sbio->bio = bio; - btrfs_queue_work(fs_info->scrub_workers, &sbio->work); + queue_work(fs_info->scrub_workers, &sbio->work); } -static void scrub_bio_end_io_worker(struct btrfs_work *work) +static void scrub_bio_end_io_worker(struct work_struct *work) { struct scrub_bio *sbio = container_of(work, struct scrub_bio, work); struct scrub_ctx *sctx = sbio->sctx; int i; - ASSERT(sbio->page_count <= SCRUB_PAGES_PER_BIO); + ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO); if (sbio->status) { - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; - spage->io_error = 1; - spage->sblock->no_io_error_seen = 0; + sector->io_error = 1; + sector->sblock->no_io_error_seen = 0; } } - /* now complete the scrub_block items that have all pages completed */ - for (i = 0; i < sbio->page_count; i++) { - struct scrub_page *spage = sbio->pagev[i]; - struct scrub_block *sblock = spage->sblock; + /* Now complete the scrub_block items that have all pages completed */ + for (i = 0; i < sbio->sector_count; i++) { + struct scrub_sector *sector = sbio->sectors[i]; + struct scrub_block *sblock = sector->sblock; - if (atomic_dec_and_test(&sblock->outstanding_pages)) + if (atomic_dec_and_test(&sblock->outstanding_sectors)) scrub_block_complete(sblock); scrub_block_put(sblock); } @@ -2428,13 +2395,13 @@ static inline void __scrub_mark_bitmap(struct scrub_parity *sparity, static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len); } static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity, u64 start, u32 len) { - __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len); + __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len); } static void scrub_block_complete(struct scrub_block *sblock) @@ -2456,8 +2423,8 @@ static void scrub_block_complete(struct scrub_block *sblock) } if (sblock->sparity && corrupted && !sblock->data_corrected) { - u64 start = sblock->pagev[0]->logical; - u64 end = sblock->pagev[sblock->page_count - 1]->logical + + u64 start = sblock->sectors[0]->logical; + u64 end = sblock->sectors[sblock->sector_count - 1]->logical + sblock->sctx->fs_info->sectorsize; ASSERT(end - start <= U32_MAX); @@ -2532,8 +2499,11 @@ static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum) static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, - u64 gen, int mirror_num, u64 physical_for_dev_replace) + u64 gen, int mirror_num) { + struct btrfs_device *src_dev = dev; + u64 src_physical = physical; + int src_mirror = mirror_num; int ret; u8 csum[BTRFS_CSUM_SIZE]; u32 blocksize; @@ -2561,6 +2531,18 @@ static int scrub_extent(struct scrub_ctx *sctx, 
struct map_lookup *map, WARN_ON(1); } + /* + * For dev-replace case, we can have @dev being a missing device. + * Regular scrub will avoid its execution on missing device at all, + * as that would trigger tons of read error. + * + * Reading from missing device will cause read error counts to + * increase unnecessarily. + * So here we change the read source to a good mirror. + */ + if (sctx->is_dev_replace && !dev->bdev) + scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical, + &src_dev, &src_mirror); while (len) { u32 l = min(len, blocksize); int have_csum = 0; @@ -2571,20 +2553,20 @@ static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map, if (have_csum == 0) ++sctx->stat.no_csum; } - ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen, - mirror_num, have_csum ? csum : NULL, - physical_for_dev_replace); + ret = scrub_sectors(sctx, logical, l, src_physical, src_dev, + flags, gen, src_mirror, + have_csum ? csum : NULL, physical); if (ret) return ret; len -= l; logical += l; physical += l; - physical_for_dev_replace += l; + src_physical += l; } return 0; } -static int scrub_pages_for_parity(struct scrub_parity *sparity, +static int scrub_sectors_for_parity(struct scrub_parity *sparity, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum) @@ -2613,10 +2595,10 @@ static int scrub_pages_for_parity(struct scrub_parity *sparity, scrub_parity_get(sparity); for (index = 0; len > 0; index++) { - struct scrub_page *spage; + struct scrub_sector *sector; - spage = kzalloc(sizeof(*spage), GFP_KERNEL); - if (!spage) { + sector = kzalloc(sizeof(*sector), GFP_KERNEL); + if (!sector) { leave_nomem: spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2624,29 +2606,29 @@ leave_nomem: scrub_block_put(sblock); return -ENOMEM; } - ASSERT(index < SCRUB_MAX_PAGES_PER_BLOCK); + ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK); /* For scrub block */ - scrub_page_get(spage); - sblock->pagev[index] = spage; + scrub_sector_get(sector); + sblock->sectors[index] = sector; /* For scrub parity */ - scrub_page_get(spage); - list_add_tail(&spage->list, &sparity->spages); - spage->sblock = sblock; - spage->dev = dev; - spage->flags = flags; - spage->generation = gen; - spage->logical = logical; - spage->physical = physical; - spage->mirror_num = mirror_num; + scrub_sector_get(sector); + list_add_tail(&sector->list, &sparity->sectors_list); + sector->sblock = sblock; + sector->dev = dev; + sector->flags = flags; + sector->generation = gen; + sector->logical = logical; + sector->physical = physical; + sector->mirror_num = mirror_num; if (csum) { - spage->have_csum = 1; - memcpy(spage->csum, csum, sctx->fs_info->csum_size); + sector->have_csum = 1; + memcpy(sector->csum, csum, sctx->fs_info->csum_size); } else { - spage->have_csum = 0; + sector->have_csum = 0; } - sblock->page_count++; - spage->page = alloc_page(GFP_KERNEL); - if (!spage->page) + sblock->sector_count++; + sector->page = alloc_page(GFP_KERNEL); + if (!sector->page) goto leave_nomem; @@ -2656,19 +2638,19 @@ leave_nomem: physical += sectorsize; } - WARN_ON(sblock->page_count == 0); - for (index = 0; index < sblock->page_count; index++) { - struct scrub_page *spage = sblock->pagev[index]; + WARN_ON(sblock->sector_count == 0); + for (index = 0; index < sblock->sector_count; index++) { + struct scrub_sector *sector = sblock->sectors[index]; int ret; - ret = scrub_add_page_to_rd_bio(sctx, spage); + ret = scrub_add_sector_to_rd_bio(sctx, sector); if (ret) {
scrub_block_put(sblock); return ret; } } - /* last one frees, either here or in bio completion for last page */ + /* Last one frees, either here or in bio completion for last sector */ scrub_block_put(sblock); return 0; } @@ -2707,7 +2689,7 @@ static int scrub_extent_for_parity(struct scrub_parity *sparity, if (have_csum == 0) goto skip; } - ret = scrub_pages_for_parity(sparity, logical, l, physical, dev, + ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev, flags, gen, mirror_num, have_csum ? csum : NULL); if (ret) @@ -2767,10 +2749,10 @@ static int get_raid56_logic_offset(u64 physical, int num, static void scrub_free_parity(struct scrub_parity *sparity) { struct scrub_ctx *sctx = sparity->sctx; - struct scrub_page *curr, *next; + struct scrub_sector *curr, *next; int nbits; - nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors); + nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors); if (nbits) { spin_lock(&sctx->stat_lock); sctx->stat.read_errors += nbits; @@ -2778,15 +2760,15 @@ static void scrub_free_parity(struct scrub_parity *sparity) spin_unlock(&sctx->stat_lock); } - list_for_each_entry_safe(curr, next, &sparity->spages, list) { + list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) { list_del_init(&curr->list); - scrub_page_put(curr); + scrub_sector_put(curr); } kfree(sparity); } -static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +static void scrub_parity_bio_endio_worker(struct work_struct *work) { struct scrub_parity *sparity = container_of(work, struct scrub_parity, work); @@ -2798,18 +2780,17 @@ static void scrub_parity_bio_endio_worker(struct btrfs_work *work) static void scrub_parity_bio_endio(struct bio *bio) { - struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; + struct scrub_parity *sparity = bio->bi_private; struct btrfs_fs_info *fs_info = sparity->sctx->fs_info; if (bio->bi_status) - bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, - sparity->nsectors); + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, + &sparity->dbitmap, sparity->nsectors); bio_put(bio); - btrfs_init_work(&sparity->work, scrub_parity_bio_endio_worker, NULL, - NULL); - btrfs_queue_work(fs_info->scrub_parity_workers, &sparity->work); + INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker); + queue_work(fs_info->scrub_parity_workers, &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -2822,8 +2803,8 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) u64 length; int ret; - if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap, - sparity->nsectors)) + if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap, + &sparity->ebitmap, sparity->nsectors)) goto out; length = sparity->logic_end - sparity->logic_start; @@ -2834,14 +2815,14 @@ static void scrub_parity_check_and_repair(struct scrub_parity *sparity) if (ret || !bioc || !bioc->raid_map) goto bioc_out; - bio = btrfs_bio_alloc(BIO_MAX_VECS); + bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS); bio->bi_iter.bi_sector = sparity->logic_start >> 9; bio->bi_private = sparity; bio->bi_end_io = scrub_parity_bio_endio; - rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length, + rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, sparity->scrub_dev, - sparity->dbitmap, + &sparity->dbitmap, sparity->nsectors); if (!rbio) goto rbio_out; @@ -2855,7 +2836,7 @@ rbio_out: bioc_out: btrfs_bio_counter_dec(fs_info); btrfs_put_bioc(bioc); - bitmap_or(sparity->ebitmap, sparity->ebitmap, 
sparity->dbitmap, + bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap, sparity->nsectors); spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2864,11 +2845,6 @@ out: scrub_free_parity(sparity); } -static inline int scrub_calc_parity_bitmap_len(int nsectors) -{ - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); -} - static void scrub_parity_get(struct scrub_parity *sparity) { refcount_inc(&sparity->refs); @@ -2882,6 +2858,251 @@ static void scrub_parity_put(struct scrub_parity *sparity) scrub_parity_check_and_repair(sparity); } +/* + * Return 0 if the extent item range covers any byte of the range. + * Return <0 if the extent item is before @search_start. + * Return >0 if the extent item is after @start_start + @search_len. + */ +static int compare_extent_item_range(struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info; + u64 len; + struct btrfs_key key; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY); + if (key.type == BTRFS_METADATA_ITEM_KEY) + len = fs_info->nodesize; + else + len = key.offset; + + if (key.objectid + len <= search_start) + return -1; + if (key.objectid >= search_start + search_len) + return 1; + return 0; +} + +/* + * Locate one extent item which covers any byte in range + * [@search_start, @search_start + @search_length) + * + * If the path is not initialized, we will initialize the search by doing + * a btrfs_search_slot(). + * If the path is already initialized, we will use the path as the initial + * slot, to avoid duplicated btrfs_search_slot() calls. + * + * NOTE: If an extent item starts before @search_start, we will still + * return the extent item. This is for data extent crossing stripe boundary. + * + * Return 0 if we found such extent item, and @path will point to the extent item. + * Return >0 if no such extent item can be found, and @path will be released. + * Return <0 if hit fatal error, and @path will be released. + */ +static int find_first_extent_item(struct btrfs_root *extent_root, + struct btrfs_path *path, + u64 search_start, u64 search_len) +{ + struct btrfs_fs_info *fs_info = extent_root->fs_info; + struct btrfs_key key; + int ret; + + /* Continue using the existing path */ + if (path->nodes[0]) + goto search_forward; + + if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) + key.type = BTRFS_METADATA_ITEM_KEY; + else + key.type = BTRFS_EXTENT_ITEM_KEY; + key.objectid = search_start; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + return ret; + + ASSERT(ret > 0); + /* + * Here we intentionally pass 0 as @min_objectid, as there could be + * an extent item starting before @search_start. + */ + ret = btrfs_previous_extent_item(extent_root, path, 0); + if (ret < 0) + return ret; + /* + * No matter whether we have found an extent item, the next loop will + * properly do every check on the key. 
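
find_first_extent_item() and get_extent_info() are meant to be called in a loop over one logical range, reusing the same path between iterations (scrub_simple_mirror(), for instance, also sets path.search_commit_root and path.skip_locking on that path). A schematic caller, with error handling trimmed and the range variables as placeholders, might look like the sketch below; it illustrates the intended usage and is not code from the patch.

/* Walk every extent item overlapping [start, start + len). */
static int walk_extent_items(struct btrfs_root *extent_root, u64 start, u64 len)
{
        struct btrfs_path path = { 0 };
        u64 cur = start;
        int ret = 0;

        while (cur < start + len) {
                u64 extent_start, extent_size, flags, gen;

                /* Reuses @path; the helper releases it when returning non-zero. */
                ret = find_first_extent_item(extent_root, &path, cur,
                                             start + len - cur);
                if (ret > 0) {          /* no more extent items in the range */
                        ret = 0;
                        break;
                }
                if (ret < 0)
                        break;
                get_extent_info(&path, &extent_start, &extent_size, &flags, &gen);

                /* The item may begin before @cur; clamp before using it. */
                cur = max(extent_start, cur);
                /* ... process [cur, min(extent_start + extent_size, start + len)) ... */
                cur = min(extent_start + extent_size, start + len);
        }
        btrfs_release_path(&path);
        return ret;
}
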
+ */ +search_forward: + while (true) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid >= search_start + search_len) + break; + if (key.type != BTRFS_METADATA_ITEM_KEY && + key.type != BTRFS_EXTENT_ITEM_KEY) + goto next; + + ret = compare_extent_item_range(path, search_start, search_len); + if (ret == 0) + return ret; + if (ret > 0) + break; +next: + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(extent_root, path); + if (ret) { + /* Either no more item or fatal error */ + btrfs_release_path(path); + return ret; + } + } + } + btrfs_release_path(path); + return 1; +} + +static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret, + u64 *size_ret, u64 *flags_ret, u64 *generation_ret) +{ + struct btrfs_key key; + struct btrfs_extent_item *ei; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + ASSERT(key.type == BTRFS_METADATA_ITEM_KEY || + key.type == BTRFS_EXTENT_ITEM_KEY); + *extent_start_ret = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + *size_ret = path->nodes[0]->fs_info->nodesize; + else + *size_ret = key.offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); + *flags_ret = btrfs_extent_flags(path->nodes[0], ei); + *generation_ret = btrfs_extent_generation(path->nodes[0], ei); +} + +static bool does_range_cross_boundary(u64 extent_start, u64 extent_len, + u64 boundary_start, u64 boudary_len) +{ + return (extent_start < boundary_start && + extent_start + extent_len > boundary_start) || + (extent_start < boundary_start + boudary_len && + extent_start + extent_len > boundary_start + boudary_len); +} + +static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx, + struct scrub_parity *sparity, + struct map_lookup *map, + struct btrfs_device *sdev, + struct btrfs_path *path, + u64 logical) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical); + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical); + u64 cur_logical = logical; + int ret; + + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + + /* Path must not be populated */ + ASSERT(!path->nodes[0]); + + while (cur_logical < logical + map->stripe_len) { + struct btrfs_io_context *bioc = NULL; + struct btrfs_device *extent_dev; + u64 extent_start; + u64 extent_size; + u64 mapped_length; + u64 extent_flags; + u64 extent_gen; + u64 extent_physical; + u64 extent_mirror_num; + + ret = find_first_extent_item(extent_root, path, cur_logical, + logical + map->stripe_len - cur_logical); + /* No more extent item in this data stripe */ + if (ret > 0) { + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(path, &extent_start, &extent_size, &extent_flags, + &extent_gen); + + /* Metadata should not cross stripe boundaries */ + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_size, + logical, map->stripe_len)) { + btrfs_err(fs_info, + "scrub: tree block %llu spanning stripes, ignored. 
logical=%llu", + extent_start, logical); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += extent_size; + continue; + } + + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* Truncate the range inside this data stripe */ + extent_size = min(extent_start + extent_size, + logical + map->stripe_len) - cur_logical; + extent_start = cur_logical; + ASSERT(extent_size <= U32_MAX); + + scrub_parity_mark_sectors_data(sparity, extent_start, extent_size); + + mapped_length = extent_size; + ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start, + &mapped_length, &bioc, 0); + if (!ret && (!bioc || mapped_length < extent_size)) + ret = -EIO; + if (ret) { + btrfs_put_bioc(bioc); + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + extent_physical = bioc->stripes[0].physical; + extent_mirror_num = bioc->mirror_num; + extent_dev = bioc->stripes[0].dev; + btrfs_put_bioc(bioc); + + ret = btrfs_lookup_csums_range(csum_root, extent_start, + extent_start + extent_size - 1, + &sctx->csum_list, 1); + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + ret = scrub_extent_for_parity(sparity, extent_start, + extent_size, extent_physical, + extent_dev, extent_flags, + extent_gen, extent_mirror_num); + scrub_free_csums(sctx); + + if (ret) { + scrub_parity_mark_sectors_error(sparity, extent_start, + extent_size); + break; + } + + cond_resched(); + cur_logical += extent_size; + } + btrfs_release_path(path); + return ret; +} + static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, struct map_lookup *map, struct btrfs_device *sdev, @@ -2889,28 +3110,11 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, u64 logic_end) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct btrfs_root *root = btrfs_extent_root(fs_info, logic_start); - struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; - struct btrfs_io_context *bioc = NULL; struct btrfs_path *path; - u64 flags; + u64 cur_logical; int ret; - int slot; - struct extent_buffer *l; - struct btrfs_key key; - u64 generation; - u64 extent_logical; - u64 extent_physical; - /* Check the comment in scrub_stripe() for why u32 is enough here */ - u32 extent_len; - u64 mapped_length; - struct btrfs_device *extent_dev; struct scrub_parity *sparity; int nsectors; - int bitmap_len; - int extent_mirror_num; - int stop_loop = 0; path = btrfs_alloc_path(); if (!path) { @@ -2924,9 +3128,8 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, ASSERT(map->stripe_len <= U32_MAX); nsectors = map->stripe_len >> fs_info->sectorsize_bits; - bitmap_len = scrub_calc_parity_bitmap_len(nsectors); - sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len, - GFP_NOFS); + ASSERT(nsectors <= BITS_PER_LONG); + sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS); if (!sparity) { spin_lock(&sctx->stat_lock); sctx->stat.malloc_errors++; @@ -2943,178 +3146,17 @@ static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx, sparity->logic_start = logic_start; sparity->logic_end = logic_end; refcount_set(&sparity->refs, 1); - INIT_LIST_HEAD(&sparity->spages); - sparity->dbitmap = sparity->bitmap; - sparity->ebitmap = (void *)sparity->bitmap + bitmap_len; + INIT_LIST_HEAD(&sparity->sectors_list); ret = 0; - while (logic_start < logic_end) { - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - 
key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logic_start; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + for (cur_logical = logic_start; cur_logical < logic_end; + cur_logical += map->stripe_len) { + ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map, + sdev, path, cur_logical); if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logic_start) - goto next; - - if (key.objectid >= logic_end) { - stop_loop = 1; - break; - } - - while (key.objectid >= logic_start + map->stripe_len) - logic_start += map->stripe_len; - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logic_start || - key.objectid + bytes > - logic_start + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logic_start); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - if (extent_logical < logic_start) { - extent_len -= logic_start - extent_logical; - extent_logical = logic_start; - } - - if (extent_logical + extent_len > - logic_start + map->stripe_len) - extent_len = logic_start + map->stripe_len - - extent_logical; - - scrub_parity_mark_sectors_data(sparity, extent_logical, - extent_len); - - mapped_length = extent_len; - bioc = NULL; - ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, - extent_logical, &mapped_length, &bioc, - 0); - if (!ret) { - if (!bioc || mapped_length < extent_len) - ret = -EIO; - } - if (ret) { - btrfs_put_bioc(bioc); - goto out; - } - extent_physical = bioc->stripes[0].physical; - extent_mirror_num = bioc->mirror_num; - extent_dev = bioc->stripes[0].dev; - btrfs_put_bioc(bioc); - - csum_root = btrfs_csum_root(fs_info, extent_logical); - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - - ret = scrub_extent_for_parity(sparity, extent_logical, - extent_len, - extent_physical, - extent_dev, flags, - generation, - extent_mirror_num); - - scrub_free_csums(sctx); - - if (ret) - goto out; - - if (extent_logical + extent_len < - key.objectid + bytes) { - logic_start += map->stripe_len; - - if (logic_start >= logic_end) { - stop_loop = 1; - break; - } - - if (logic_start < key.objectid + bytes) { - cond_resched(); - goto again; - } - } -next: - path->slots[0]++; - } - - btrfs_release_path(path); - - if (stop_loop) break; - - logic_start += map->stripe_len; - } -out: - if (ret < 0) { - 
ASSERT(logic_end - logic_start <= U32_MAX); - scrub_parity_mark_sectors_error(sparity, logic_start, - logic_end - logic_start); } + scrub_parity_put(sparity); scrub_submit(sctx); mutex_lock(&sctx->wr_lock); @@ -3165,69 +3207,234 @@ static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical, return ret; } +/* + * Scrub one range which can only has simple mirror based profile. + * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in + * RAID0/RAID10). + * + * Since we may need to handle a subset of block group, we need @logical_start + * and @logical_length parameter. + */ +static int scrub_simple_mirror(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + u64 logical_start, u64 logical_length, + struct btrfs_device *device, + u64 physical, int mirror_num) +{ + struct btrfs_fs_info *fs_info = sctx->fs_info; + const u64 logical_end = logical_start + logical_length; + /* An artificial limit, inherit from old scrub behavior */ + const u32 max_length = SZ_64K; + struct btrfs_path path = { 0 }; + u64 cur_logical = logical_start; + int ret; + + /* The range must be inside the bg */ + ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length); + + path.search_commit_root = 1; + path.skip_locking = 1; + /* Go through each extent items inside the logical range */ + while (cur_logical < logical_end) { + u64 extent_start; + u64 extent_len; + u64 extent_flags; + u64 extent_gen; + u64 scrub_len; + + /* Canceled? */ + if (atomic_read(&fs_info->scrub_cancel_req) || + atomic_read(&sctx->cancel_req)) { + ret = -ECANCELED; + break; + } + /* Paused? */ + if (atomic_read(&fs_info->scrub_pause_req)) { + /* Push queued extents */ + sctx->flush_all_writes = true; + scrub_submit(sctx); + mutex_lock(&sctx->wr_lock); + scrub_wr_submit(sctx); + mutex_unlock(&sctx->wr_lock); + wait_event(sctx->list_wait, + atomic_read(&sctx->bios_in_flight) == 0); + sctx->flush_all_writes = false; + scrub_blocked_if_needed(fs_info); + } + /* Block group removed? */ + spin_lock(&bg->lock); + if (bg->removed) { + spin_unlock(&bg->lock); + ret = 0; + break; + } + spin_unlock(&bg->lock); + + ret = find_first_extent_item(extent_root, &path, cur_logical, + logical_end - cur_logical); + if (ret > 0) { + /* No more extent, just update the accounting */ + sctx->stat.last_physical = physical + logical_length; + ret = 0; + break; + } + if (ret < 0) + break; + get_extent_info(&path, &extent_start, &extent_len, + &extent_flags, &extent_gen); + /* Skip hole range which doesn't have any extent */ + cur_logical = max(extent_start, cur_logical); + + /* + * Scrub len has three limits: + * - Extent size limit + * - Scrub range limit + * This is especially imporatant for RAID0/RAID10 to reuse + * this function + * - Max scrub size limit + */ + scrub_len = min(min(extent_start + extent_len, + logical_end), cur_logical + max_length) - + cur_logical; + + if (extent_flags & BTRFS_EXTENT_FLAG_DATA) { + ret = btrfs_lookup_csums_range(csum_root, cur_logical, + cur_logical + scrub_len - 1, + &sctx->csum_list, 1); + if (ret) + break; + } + if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && + does_range_cross_boundary(extent_start, extent_len, + logical_start, logical_length)) { + btrfs_err(fs_info, +"scrub: tree block %llu spanning boundaries, ignored. 
boundary=[%llu, %llu)", + extent_start, logical_start, logical_end); + spin_lock(&sctx->stat_lock); + sctx->stat.uncorrectable_errors++; + spin_unlock(&sctx->stat_lock); + cur_logical += scrub_len; + continue; + } + ret = scrub_extent(sctx, map, cur_logical, scrub_len, + cur_logical - logical_start + physical, + device, extent_flags, extent_gen, + mirror_num); + scrub_free_csums(sctx); + if (ret) + break; + if (sctx->is_dev_replace) + sync_replace_for_zoned(sctx); + cur_logical += scrub_len; + /* Don't hold CPU for too long time */ + cond_resched(); + } + btrfs_release_path(&path); + return ret; +} + +/* Calculate the full stripe length for simple stripe based profiles */ +static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + + return map->num_stripes / map->sub_stripes * map->stripe_len; +} + +/* Get the logical bytenr for the stripe */ +static u64 simple_stripe_get_logical(struct map_lookup *map, + struct btrfs_block_group *bg, + int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* + * (stripe_index / sub_stripes) gives how many data stripes we need to + * skip. + */ + return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start; +} + +/* Get the mirror number for the stripe */ +static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +{ + ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID10)); + ASSERT(stripe_index < map->num_stripes); + + /* For RAID0, it's fixed to 1, for RAID10 it's 0,1,0,1... */ + return stripe_index % map->sub_stripes + 1; +} + +static int scrub_simple_stripe(struct scrub_ctx *sctx, + struct btrfs_root *extent_root, + struct btrfs_root *csum_root, + struct btrfs_block_group *bg, + struct map_lookup *map, + struct btrfs_device *device, + int stripe_index) +{ + const u64 logical_increment = simple_stripe_full_stripe_len(map); + const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index); + const u64 orig_physical = map->stripes[stripe_index].physical; + const int mirror_num = simple_stripe_mirror_num(map, stripe_index); + u64 cur_logical = orig_logical; + u64 cur_physical = orig_physical; + int ret = 0; + + while (cur_logical < bg->start + bg->length) { + /* + * Inside each stripe, RAID0 is just SINGLE, and RAID10 is + * just RAID1, so we can reuse scrub_simple_mirror() to scrub + * this stripe. 
+ */ + ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map, + cur_logical, map->stripe_len, device, + cur_physical, mirror_num); + if (ret) + return ret; + /* Skip to next stripe which belongs to the target device */ + cur_logical += logical_increment; + /* For physical offset, we just go to next stripe */ + cur_physical += map->stripe_len; + } + return ret; +} + static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct extent_map *em, struct btrfs_device *scrub_dev, - int stripe_index, u64 dev_extent_len) + int stripe_index) { struct btrfs_path *path; struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_root *root; struct btrfs_root *csum_root; - struct btrfs_extent_item *extent; struct blk_plug plug; + struct map_lookup *map = em->map_lookup; + const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; - u64 flags; int ret; - int slot; - u64 nstripes; - struct extent_buffer *l; - u64 physical; + u64 physical = map->stripes[stripe_index].physical; + const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; - u64 physical_end; - u64 generation; - int mirror_num; - struct btrfs_key key; + /* The logical increment after finishing one stripe */ u64 increment; + /* Offset inside the chunk */ u64 offset; - u64 extent_logical; - u64 extent_physical; - /* - * Unlike chunk length, extent length should never go beyond - * BTRFS_MAX_EXTENT_SIZE, thus u32 is enough here. - */ - u32 extent_len; u64 stripe_logical; u64 stripe_end; - struct btrfs_device *extent_dev; - int extent_mirror_num; int stop_loop = 0; - physical = map->stripes[stripe_index].physical; - offset = 0; - nstripes = div64_u64(dev_extent_len, map->stripe_len); - mirror_num = 1; - increment = map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - offset = map->stripe_len * stripe_index; - increment = map->stripe_len * map->num_stripes; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - int factor = map->num_stripes / map->sub_stripes; - offset = map->stripe_len * (stripe_index / map->sub_stripes); - increment = map->stripe_len * factor; - mirror_num = stripe_index % map->sub_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - mirror_num = stripe_index % map->num_stripes + 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical, stripe_index, map, &offset, - NULL); - increment = map->stripe_len * nr_data_stripes(map); - } - path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -3241,21 +3448,12 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, path->skip_locking = 1; path->reada = READA_FORWARD; - logical = chunk_logical + offset; - physical_end = physical + nstripes * map->stripe_len; - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - get_raid56_logic_offset(physical_end, stripe_index, - map, &logic_end, NULL); - logic_end += chunk_logical; - } else { - logic_end = logical + increment * nstripes; - } wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0); scrub_blocked_if_needed(fs_info); - root = btrfs_extent_root(fs_info, logical); - csum_root = btrfs_csum_root(fs_info, logical); + root = btrfs_extent_root(fs_info, bg->start); + csum_root = btrfs_csum_root(fs_info, bg->start); /* * collect all data csums for the 
stripe to avoid seeking during @@ -3272,247 +3470,89 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, } /* - * now find all extents for each stripe and scrub them + * There used to be a big double loop to handle all profiles using the + * same routine, which grows larger and more gross over time. + * + * So here we handle each profile differently, so simpler profiles + * have simpler scrubbing function. */ - ret = 0; - while (physical < physical_end) { + if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID56_MASK))) { /* - * canceled? - */ - if (atomic_read(&fs_info->scrub_cancel_req) || - atomic_read(&sctx->cancel_req)) { - ret = -ECANCELED; - goto out; - } - /* - * check to see if we have to pause + * Above check rules out all complex profile, the remaining + * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple + * mirrored duplication without stripe. + * + * Only @physical and @mirror_num needs to calculated using + * @stripe_index. */ - if (atomic_read(&fs_info->scrub_pause_req)) { - /* push queued extents */ - sctx->flush_all_writes = true; - scrub_submit(sctx); - mutex_lock(&sctx->wr_lock); - scrub_wr_submit(sctx); - mutex_unlock(&sctx->wr_lock); - wait_event(sctx->list_wait, - atomic_read(&sctx->bios_in_flight) == 0); - sctx->flush_all_writes = false; - scrub_blocked_if_needed(fs_info); - } - - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - ret = get_raid56_logic_offset(physical, stripe_index, - map, &logical, - &stripe_logical); - logical += chunk_logical; - if (ret) { - /* it is parity strip */ - stripe_logical += chunk_logical; - stripe_end = stripe_logical + increment; - ret = scrub_raid56_parity(sctx, map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto skip; - } - } - - if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) - key.type = BTRFS_METADATA_ITEM_KEY; - else - key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; - key.offset = (u64)-1; - - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - if (ret > 0) { - ret = btrfs_previous_extent_item(root, path, 0); - if (ret < 0) - goto out; - if (ret > 0) { - /* there's no smaller item, so stick with the - * larger one */ - btrfs_release_path(path); - ret = btrfs_search_slot(NULL, root, &key, - path, 0, 0); - if (ret < 0) - goto out; - } - } - - stop_loop = 0; - while (1) { - u64 bytes; - - l = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(l)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto out; - - stop_loop = 1; - break; - } - btrfs_item_key_to_cpu(l, &key, slot); - - if (key.type != BTRFS_EXTENT_ITEM_KEY && - key.type != BTRFS_METADATA_ITEM_KEY) - goto next; - - if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = fs_info->nodesize; - else - bytes = key.offset; - - if (key.objectid + bytes <= logical) - goto next; - - if (key.objectid >= logical + map->stripe_len) { - /* out of this device extent */ - if (key.objectid >= logic_end) - stop_loop = 1; - break; - } - - /* - * If our block group was removed in the meanwhile, just - * stop scrubbing since there is no point in continuing. - * Continuing would prevent reusing its device extents - * for new block groups for a long time. 
- */ - spin_lock(&bg->lock); - if (bg->removed) { - spin_unlock(&bg->lock); - ret = 0; - goto out; - } - spin_unlock(&bg->lock); - - extent = btrfs_item_ptr(l, slot, - struct btrfs_extent_item); - flags = btrfs_extent_flags(l, extent); - generation = btrfs_extent_generation(l, extent); - - if ((flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) && - (key.objectid < logical || - key.objectid + bytes > - logical + map->stripe_len)) { - btrfs_err(fs_info, - "scrub: tree block %llu spanning stripes, ignored. logical=%llu", - key.objectid, logical); - spin_lock(&sctx->stat_lock); - sctx->stat.uncorrectable_errors++; - spin_unlock(&sctx->stat_lock); - goto next; - } - -again: - extent_logical = key.objectid; - ASSERT(bytes <= U32_MAX); - extent_len = bytes; - - /* - * trim extent to this stripe - */ - if (extent_logical < logical) { - extent_len -= logical - extent_logical; - extent_logical = logical; - } - if (extent_logical + extent_len > - logical + map->stripe_len) { - extent_len = logical + map->stripe_len - - extent_logical; - } + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + bg->start, bg->length, scrub_dev, + map->stripes[stripe_index].physical, + stripe_index + 1); + offset = 0; + goto out; + } + if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { + ret = scrub_simple_stripe(sctx, root, csum_root, bg, map, + scrub_dev, stripe_index); + offset = map->stripe_len * (stripe_index / map->sub_stripes); + goto out; + } - extent_physical = extent_logical - logical + physical; - extent_dev = scrub_dev; - extent_mirror_num = mirror_num; - if (sctx->is_dev_replace) - scrub_remap_extent(fs_info, extent_logical, - extent_len, &extent_physical, - &extent_dev, - &extent_mirror_num); - - if (flags & BTRFS_EXTENT_FLAG_DATA) { - ret = btrfs_lookup_csums_range(csum_root, - extent_logical, - extent_logical + extent_len - 1, - &sctx->csum_list, 1); - if (ret) - goto out; - } + /* Only RAID56 goes through the old code */ + ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK); + ret = 0; - ret = scrub_extent(sctx, map, extent_logical, extent_len, - extent_physical, extent_dev, flags, - generation, extent_mirror_num, - extent_logical - logical + physical); + /* Calculate the logical end of the stripe */ + get_raid56_logic_offset(physical_end, stripe_index, + map, &logic_end, NULL); + logic_end += chunk_logical; - scrub_free_csums(sctx); + /* Initialize @offset in case we need to go to out: label */ + get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL); + increment = map->stripe_len * nr_data_stripes(map); + /* + * Due to the rotation, for RAID56 it's better to iterate each stripe + * using their physical offset. + */ + while (physical < physical_end) { + ret = get_raid56_logic_offset(physical, stripe_index, map, + &logical, &stripe_logical); + logical += chunk_logical; + if (ret) { + /* it is parity strip */ + stripe_logical += chunk_logical; + stripe_end = stripe_logical + increment; + ret = scrub_raid56_parity(sctx, map, scrub_dev, + stripe_logical, + stripe_end); if (ret) goto out; + goto next; + } - if (sctx->is_dev_replace) - sync_replace_for_zoned(sctx); - - if (extent_logical + extent_len < - key.objectid + bytes) { - if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - /* - * loop until we find next data stripe - * or we have finished all stripes. 
- */ -loop: - physical += map->stripe_len; - ret = get_raid56_logic_offset(physical, - stripe_index, map, - &logical, &stripe_logical); - logical += chunk_logical; - - if (ret && physical < physical_end) { - stripe_logical += chunk_logical; - stripe_end = stripe_logical + - increment; - ret = scrub_raid56_parity(sctx, - map, scrub_dev, - stripe_logical, - stripe_end); - if (ret) - goto out; - goto loop; - } - } else { - physical += map->stripe_len; - logical += increment; - } - if (logical < key.objectid + bytes) { - cond_resched(); - goto again; - } - - if (physical >= physical_end) { - stop_loop = 1; - break; - } - } + /* + * Now we're at a data stripe, scrub each extents in the range. + * + * At this stage, if we ignore the repair part, inside each data + * stripe it is no different than SINGLE profile. + * We can reuse scrub_simple_mirror() here, as the repair part + * is still based on @mirror_num. + */ + ret = scrub_simple_mirror(sctx, root, csum_root, bg, map, + logical, map->stripe_len, + scrub_dev, physical, 1); + if (ret < 0) + goto out; next: - path->slots[0]++; - } - btrfs_release_path(path); -skip: logical += increment; physical += map->stripe_len; spin_lock(&sctx->stat_lock); if (stop_loop) - sctx->stat.last_physical = map->stripes[stripe_index].physical + - dev_extent_len; + sctx->stat.last_physical = + map->stripes[stripe_index].physical + dev_stripe_len; else sctx->stat.last_physical = physical; spin_unlock(&sctx->stat_lock); @@ -3581,8 +3621,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, map, scrub_dev, i, - dev_extent_len); + ret = scrub_stripe(sctx, bg, em, scrub_dev, i); if (ret) goto out; } @@ -3964,9 +4003,9 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (!btrfs_check_super_location(scrub_dev, bytenr)) continue; - ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, - scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, - NULL, bytenr); + ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, + scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i, + NULL, bytenr); if (ret) return ret; } @@ -3979,22 +4018,23 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) { if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt, &fs_info->scrub_lock)) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; - - scrub_workers = fs_info->scrub_workers; - scrub_wr_comp = fs_info->scrub_wr_completion_workers; - scrub_parity = fs_info->scrub_parity_workers; + struct workqueue_struct *scrub_workers = fs_info->scrub_workers; + struct workqueue_struct *scrub_wr_comp = + fs_info->scrub_wr_completion_workers; + struct workqueue_struct *scrub_parity = + fs_info->scrub_parity_workers; fs_info->scrub_workers = NULL; fs_info->scrub_wr_completion_workers = NULL; fs_info->scrub_parity_workers = NULL; mutex_unlock(&fs_info->scrub_lock); - btrfs_destroy_workqueue(scrub_workers); - btrfs_destroy_workqueue(scrub_wr_comp); - btrfs_destroy_workqueue(scrub_parity); + if (scrub_workers) + destroy_workqueue(scrub_workers); + if (scrub_wr_comp) + destroy_workqueue(scrub_wr_comp); + if (scrub_parity) + destroy_workqueue(scrub_parity); } } @@ -4004,9 +4044,9 @@ static void scrub_workers_put(struct btrfs_fs_info *fs_info) static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info 
*fs_info, int is_dev_replace) { - struct btrfs_workqueue *scrub_workers = NULL; - struct btrfs_workqueue *scrub_wr_comp = NULL; - struct btrfs_workqueue *scrub_parity = NULL; + struct workqueue_struct *scrub_workers = NULL; + struct workqueue_struct *scrub_wr_comp = NULL; + struct workqueue_struct *scrub_parity = NULL; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; int ret = -ENOMEM; @@ -4014,18 +4054,16 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt)) return 0; - scrub_workers = btrfs_alloc_workqueue(fs_info, "scrub", flags, - is_dev_replace ? 1 : max_active, 4); + scrub_workers = alloc_workqueue("btrfs-scrub", flags, + is_dev_replace ? 1 : max_active); if (!scrub_workers) goto fail_scrub_workers; - scrub_wr_comp = btrfs_alloc_workqueue(fs_info, "scrubwrc", flags, - max_active, 2); + scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active); if (!scrub_wr_comp) goto fail_scrub_wr_completion_workers; - scrub_parity = btrfs_alloc_workqueue(fs_info, "scrubparity", flags, - max_active, 2); + scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active); if (!scrub_parity) goto fail_scrub_parity_workers; @@ -4046,11 +4084,11 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->scrub_lock); ret = 0; - btrfs_destroy_workqueue(scrub_parity); + destroy_workqueue(scrub_parity); fail_scrub_parity_workers: - btrfs_destroy_workqueue(scrub_wr_comp); + destroy_workqueue(scrub_wr_comp); fail_scrub_wr_completion_workers: - btrfs_destroy_workqueue(scrub_workers); + destroy_workqueue(scrub_workers); fail_scrub_workers: return ret; } @@ -4082,18 +4120,16 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, } if (fs_info->nodesize > - PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK || - fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) { + SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits || + fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) { /* - * would exhaust the array bounds of pagev member in + * Would exhaust the array bounds of sectorv member in * struct scrub_block */ btrfs_err(fs_info, - "scrub: size assumption nodesize and sectorsize <= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails", - fs_info->nodesize, - SCRUB_MAX_PAGES_PER_BLOCK, - fs_info->sectorsize, - SCRUB_MAX_PAGES_PER_BLOCK); +"scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails", + fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK, + fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK); return -EINVAL; } @@ -4161,7 +4197,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, /* * In order to avoid deadlock with reclaim when there is a transaction * trying to pause scrub, make sure we use GFP_NOFS for all the - * allocations done at btrfs_scrub_pages() and scrub_pages_for_parity() + * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity() * invoked by our callees. The pausing request is done when the * transaction commit starts, and it blocks the transaction until scrub * is paused (done at specific points at scrub_stripe() or right above @@ -4295,11 +4331,11 @@ int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid, return dev ? (sctx ? 
0 : -ENOTCONN) : -ENODEV; } -static void scrub_remap_extent(struct btrfs_fs_info *fs_info, - u64 extent_logical, u32 extent_len, - u64 *extent_physical, - struct btrfs_device **extent_dev, - int *extent_mirror_num) +static void scrub_find_good_copy(struct btrfs_fs_info *fs_info, + u64 extent_logical, u32 extent_len, + u64 *extent_physical, + struct btrfs_device **extent_dev, + int *extent_mirror_num) { u64 mapped_length; struct btrfs_io_context *bioc = NULL; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7d1642937274..e7671afcee4f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -17,6 +17,7 @@ #include <linux/crc32c.h> #include "send.h" +#include "ctree.h" #include "backref.h" #include "locking.h" #include "disk-io.h" @@ -82,8 +83,12 @@ struct send_ctx { char *send_buf; u32 send_size; u32 send_max_size; - u64 total_send_size; - u64 cmd_send_size[BTRFS_SEND_C_MAX + 1]; + /* + * Whether BTRFS_SEND_A_DATA attribute was already added to current + * command (since protocol v2, data must be the last attribute). + */ + bool put_data; + struct page **send_buf_pages; u64 flags; /* 'flags' member of btrfs_ioctl_send_args is u64 */ /* Protocol version compatibility requested */ u32 proto; @@ -113,14 +118,14 @@ struct send_ctx { */ u64 cur_ino; u64 cur_inode_gen; - int cur_inode_new; - int cur_inode_new_gen; - int cur_inode_deleted; u64 cur_inode_size; u64 cur_inode_mode; u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + bool cur_inode_new; + bool cur_inode_new_gen; + bool cur_inode_deleted; bool ignore_cur_inode; u64 send_progress; @@ -132,7 +137,14 @@ struct send_ctx { struct list_head name_cache_list; int name_cache_size; + /* + * The inode we are currently processing. It's not NULL only when we + * need to issue write commands for data extents from this inode. + */ + struct inode *cur_inode; struct file_ra_state ra; + u64 page_cache_clear_start; + bool clean_page_cache; /* * We process inodes by their increasing order, so if before an @@ -228,6 +240,9 @@ struct send_ctx { * Indexed by the inode number of the directory to be deleted. 
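The scrub_workers_get() and scrub_workers_put() hunks above drop the btrfs_workqueue wrapper and use plain kernel workqueues directly. Below is a minimal kernel-style sketch of that allocate/queue/tear-down pattern with the same WQ_FREEZABLE | WQ_UNBOUND flags; the names (scrub_wq, my_scrub_func, example_workers_*) are illustrative, and this is a sketch of the generic workqueue API, not the btrfs code.

#include <linux/errno.h>
#include <linux/workqueue.h>

static struct workqueue_struct *scrub_wq;       /* illustrative name */

static void my_scrub_func(struct work_struct *work)
{
        /* one unit of scrub work would run here */
}

static DECLARE_WORK(my_scrub_work, my_scrub_func);

static int example_workers_get(int max_active)
{
        scrub_wq = alloc_workqueue("btrfs-scrub-example",
                                   WQ_FREEZABLE | WQ_UNBOUND, max_active);
        if (!scrub_wq)
                return -ENOMEM;
        queue_work(scrub_wq, &my_scrub_work);
        return 0;
}

static void example_workers_put(void)
{
        if (scrub_wq) {
                /* Drains queued work, then frees the workqueue. */
                destroy_workqueue(scrub_wq);
                scrub_wq = NULL;
        }
}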
*/ struct rb_root orphan_dirs; + + struct rb_root rbtree_new_refs; + struct rb_root rbtree_deleted_refs; }; struct pending_dir_move { @@ -328,8 +343,8 @@ __maybe_unused static bool proto_cmd_ok(const struct send_ctx *sctx, int cmd) { switch (sctx->proto) { - case 1: return cmd < __BTRFS_SEND_C_MAX_V1; - case 2: return cmd < __BTRFS_SEND_C_MAX_V2; + case 1: return cmd <= BTRFS_SEND_C_MAX_V1; + case 2: return cmd <= BTRFS_SEND_C_MAX_V2; default: return false; } } @@ -570,15 +585,10 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) while (pos < len) { ret = kernel_write(filp, buf + pos, len - pos, off); - /* TODO handle that correctly */ - /*if (ret == -ERESTARTSYS) { - continue; - }*/ if (ret < 0) return ret; - if (ret == 0) { + if (ret == 0) return -EIO; - } pos += ret; } @@ -591,6 +601,9 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) int total_len = sizeof(*hdr) + len; int left = sctx->send_max_size - sctx->send_size; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + if (unlikely(left < total_len)) return -EOVERFLOW; @@ -611,6 +624,7 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return tlv_put(sctx, attr, &__tmp, sizeof(__tmp)); \ } +TLV_PUT_DEFINE_INT(32) TLV_PUT_DEFINE_INT(64) static int tlv_put_string(struct send_ctx *sctx, u16 attr, @@ -686,8 +700,7 @@ static int send_header(struct send_ctx *sctx) struct btrfs_stream_header hdr; strcpy(hdr.magic, BTRFS_SEND_STREAM_MAGIC); - hdr.version = cpu_to_le32(BTRFS_SEND_STREAM_VERSION); - + hdr.version = cpu_to_le32(sctx->proto); return write_buf(sctx->send_filp, &hdr, sizeof(hdr), &sctx->send_off); } @@ -727,9 +740,8 @@ static int send_cmd(struct send_ctx *sctx) ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); - sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; + sctx->put_data = false; return ret; } @@ -835,7 +847,7 @@ out: */ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, - u64 *gid, u64 *rdev) + u64 *gid, u64 *rdev, u64 *fileattr) { int ret; struct btrfs_inode_item *ii; @@ -865,6 +877,12 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, *gid = btrfs_inode_gid(path->nodes[0], ii); if (rdev) *rdev = btrfs_inode_rdev(path->nodes[0], ii); + /* + * Transfer the unchanged u64 value of btrfs_inode_item::flags, that's + * otherwise logically split to 32/32 parts. 
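tlv_put() above, together with the TLV_PUT_DEFINE_INT() wrappers (this patch adds the 32-bit variant), appends each attribute to the in-memory command as a 16-bit type, a 16-bit length and the payload. A standalone userspace sketch of that framing follows; every name is demo-local, values are stored host-endian for brevity while the kernel stores them little-endian, and the attribute number used in main() is only an example taken from the send attribute enum.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Mirrors struct btrfs_tlv_header: 16-bit type plus 16-bit payload length. */
struct tlv_header {
        uint16_t tlv_type;
        uint16_t tlv_len;
};

struct send_buf {
        uint8_t data[256];
        size_t size;            /* bytes used so far */
        size_t max_size;
};

static int tlv_put(struct send_buf *b, uint16_t attr, const void *payload,
                   uint16_t len)
{
        struct tlv_header hdr = { .tlv_type = attr, .tlv_len = len };

        if (b->max_size - b->size < sizeof(hdr) + len)
                return -1;      /* -EOVERFLOW in the kernel code */
        memcpy(b->data + b->size, &hdr, sizeof(hdr));
        memcpy(b->data + b->size + sizeof(hdr), payload, len);
        b->size += sizeof(hdr) + len;
        return 0;
}

/* Roughly what TLV_PUT_DEFINE_INT(64) expands to: a typed wrapper. */
static int tlv_put_u64(struct send_buf *b, uint16_t attr, uint64_t value)
{
        return tlv_put(b, attr, &value, sizeof(value));
}

int main(void)
{
        struct send_buf b = { .max_size = sizeof(b.data) };

        tlv_put_u64(&b, 18 /* e.g. BTRFS_SEND_A_FILE_OFFSET */, 4096);
        printf("encoded %zu bytes\n", b.size);
        return 0;
}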
+ */ + if (fileattr) + *fileattr = btrfs_inode_flags(path->nodes[0], ii); return ret; } @@ -872,7 +890,7 @@ static int __get_inode_info(struct btrfs_root *root, struct btrfs_path *path, static int get_inode_info(struct btrfs_root *root, u64 ino, u64 *size, u64 *gen, u64 *mode, u64 *uid, u64 *gid, - u64 *rdev) + u64 *rdev, u64 *fileattr) { struct btrfs_path *path; int ret; @@ -881,7 +899,7 @@ static int get_inode_info(struct btrfs_root *root, if (!path) return -ENOMEM; ret = __get_inode_info(root, path, ino, size, gen, mode, uid, gid, - rdev); + rdev, fileattr); btrfs_free_path(path); return ret; } @@ -1627,7 +1645,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) u64 right_gen; ret = get_inode_info(sctx->send_root, ino, NULL, &left_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; left_ret = ret; @@ -1636,7 +1654,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen) right_ret = -ENOENT; } else { ret = get_inode_info(sctx->parent_root, ino, NULL, &right_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; right_ret = ret; @@ -1799,7 +1817,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino, if (dir_gen) { ret = get_inode_info(root, parent_dir, NULL, dir_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0) goto out; } @@ -1871,7 +1889,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, */ if (sctx->parent_root && dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1899,7 +1917,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (other_inode > sctx->send_progress || is_waiting_for_move(sctx, other_inode)) { ret = get_inode_info(sctx->parent_root, other_inode, NULL, - who_gen, who_mode, NULL, NULL, NULL); + who_gen, who_mode, NULL, NULL, NULL, NULL); if (ret < 0) goto out; @@ -1938,7 +1956,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, if (dir != BTRFS_FIRST_FREE_OBJECTID) { ret = get_inode_info(sctx->send_root, dir, NULL, &gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret < 0 && ret != -ENOENT) goto out; if (ret) { @@ -1961,7 +1979,7 @@ static int did_overwrite_ref(struct send_ctx *sctx, } ret = get_inode_info(sctx->send_root, ow_inode, NULL, &gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret < 0) goto out; @@ -2177,7 +2195,7 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, /* * If the inode is not existent yet, add the orphan name and return 1. * This should only happen for the parent dir that we determine in - * __record_new_ref + * record_new_ref_if_needed(). 
*/ ret = is_inode_existent(sctx, ino, gen); if (ret < 0) @@ -2492,6 +2510,39 @@ out: return ret; } +static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) +{ + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; + int ret = 0; + struct fs_path *p; + + if (sctx->proto < 2) + return 0; + + btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); + + p = fs_path_alloc(); + if (!p) + return -ENOMEM; + + ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, ino, gen, p); + if (ret < 0) + goto out; + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(p); + return ret; +} + static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; @@ -2571,7 +2622,8 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_CTIME, eb, &ii->ctime); - /* TODO Add otime support when the otime patches get into upstream */ + if (sctx->proto >= 2) + TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_OTIME, eb, &ii->otime); ret = send_cmd(sctx); @@ -2605,7 +2657,7 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) if (ino != sctx->cur_ino) { ret = get_inode_info(sctx->send_root, ino, NULL, &gen, &mode, - NULL, NULL, &rdev); + NULL, NULL, &rdev, NULL); if (ret < 0) goto out; } else { @@ -2675,61 +2727,43 @@ out: static int did_create_dir(struct send_ctx *sctx, u64 dir) { int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; struct btrfs_key found_key; struct btrfs_key di_key; - struct extent_buffer *eb; struct btrfs_dir_item *di; - int slot; path = alloc_path_for_send(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = dir; key.type = BTRFS_DIR_INDEX_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0); - if (ret < 0) - goto out; - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->send_root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } + btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) { + struct extent_buffer *eb = path->nodes[0]; - btrfs_item_key_to_cpu(eb, &found_key, slot); if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } - di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item); + di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item); btrfs_dir_item_key_to_cpu(eb, di, &di_key); if (di_key.type != BTRFS_ROOT_ITEM_KEY && di_key.objectid < sctx->send_progress) { ret = 1; - goto out; + break; } - - path->slots[0]++; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -2762,48 +2796,50 @@ struct recorded_ref { u64 dir; u64 dir_gen; int name_len; + struct rb_node node; + struct rb_root *root; }; -static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +static struct recorded_ref *recorded_ref_alloc(void) { - ref->full_path = path; - ref->name = (char *)kbasename(ref->full_path->start); - ref->name_len = ref->full_path->end - ref->name; 
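did_create_dir() above is the first of several functions in this patch (can_rmdir(), is_ancestor(), process_all_refs(), process_all_new_xattrs(), process_all_extents(), btrfs_unlink_all_paths()) converted from an open-coded btrfs_search_slot()/btrfs_next_leaf() loop to the btrfs_for_each_slot() iterator, with any search error reported through a separate iter_ret checked once the loop ends. The toy below is not the btrfs macro; it is a self-contained illustration of that iterator shape over a plain array.

#include <stdio.h>

struct item { int key; const char *val; };

struct cursor {
        const struct item *items;
        int nritems;
        int slot;
};

/* Returns 0 while *out is valid, 1 at the end, <0 on error (unused here). */
static int cursor_next(struct cursor *c, const struct item **out)
{
        if (c->slot >= c->nritems)
                return 1;
        *out = &c->items[c->slot++];
        return 0;
}

/*
 * Shaped like btrfs_for_each_slot(root, key, found_key, path, iter_ret):
 * the body sees one item per pass and iter_ret holds the terminating status.
 */
#define for_each_item(cur, it, iter_ret) \
        while (((iter_ret) = cursor_next((cur), &(it))) == 0)

int main(void)
{
        static const struct item items[] = {
                { 1, "one" }, { 2, "two" }, { 3, "three" },
        };
        struct cursor cur = { items, 3, 0 };
        const struct item *it;
        int iter_ret = 0;
        int ret = 0;

        for_each_item(&cur, it, iter_ret) {
                if (it->key == 3)       /* stop condition inside the body */
                        break;
                printf("%d -> %s\n", it->key, it->val);
        }
        /* Catch an error found during iteration, as the new btrfs code does. */
        if (iter_ret < 0)
                ret = iter_ret;
        return ret;
}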
+ struct recorded_ref *ref; + + ref = kzalloc(sizeof(*ref), GFP_KERNEL); + if (!ref) + return NULL; + RB_CLEAR_NODE(&ref->node); + INIT_LIST_HEAD(&ref->list); + return ref; } -/* - * We need to process new refs before deleted refs, but compare_tree gives us - * everything mixed. So we first record all refs and later process them. - * This function is a helper to record one ref. - */ -static int __record_ref(struct list_head *head, u64 dir, - u64 dir_gen, struct fs_path *path) +static void recorded_ref_free(struct recorded_ref *ref) { - struct recorded_ref *ref; - - ref = kmalloc(sizeof(*ref), GFP_KERNEL); if (!ref) - return -ENOMEM; + return; + if (!RB_EMPTY_NODE(&ref->node)) + rb_erase(&ref->node, ref->root); + list_del(&ref->list); + fs_path_free(ref->full_path); + kfree(ref); +} - ref->dir = dir; - ref->dir_gen = dir_gen; - set_ref_path(ref, path); - list_add_tail(&ref->list, head); - return 0; +static void set_ref_path(struct recorded_ref *ref, struct fs_path *path) +{ + ref->full_path = path; + ref->name = (char *)kbasename(ref->full_path->start); + ref->name_len = ref->full_path->end - ref->name; } static int dup_ref(struct recorded_ref *ref, struct list_head *list) { struct recorded_ref *new; - new = kmalloc(sizeof(*ref), GFP_KERNEL); + new = recorded_ref_alloc(); if (!new) return -ENOMEM; new->dir = ref->dir; new->dir_gen = ref->dir_gen; - new->full_path = NULL; - INIT_LIST_HEAD(&new->list); list_add_tail(&new->list, list); return 0; } @@ -2814,9 +2850,7 @@ static void __free_recorded_refs(struct list_head *head) while (!list_empty(head)) { cur = list_entry(head->next, struct recorded_ref, list); - fs_path_free(cur->full_path); - list_del(&cur->list); - kfree(cur); + recorded_ref_free(cur); } } @@ -2933,6 +2967,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, u64 send_progress) { int ret = 0; + int iter_ret = 0; struct btrfs_root *root = sctx->parent_root; struct btrfs_path *path; struct btrfs_key key; @@ -2959,23 +2994,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, if (odi) key.offset = odi->last_dir_index_offset; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct waiting_dir_move *dm; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, - path->slots[0]); if (found_key.objectid != key.objectid || found_key.type != key.type) break; @@ -3010,8 +3031,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen, ret = 0; goto out; } - - path->slots[0]++; + } + if (iter_ret < 0) { + ret = iter_ret; + goto out; } free_orphan_dir_info(sctx, odi); @@ -3337,7 +3360,7 @@ finish: * The parent inode might have been deleted in the send snapshot */ ret = get_inode_info(sctx->send_root, cur->dir, NULL, - NULL, NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL, NULL); if (ret == -ENOENT) { ret = 0; continue; @@ -3512,11 +3535,11 @@ static int wait_for_dest_dir_move(struct send_ctx *sctx, } ret = get_inode_info(sctx->parent_root, di_key.objectid, NULL, - &left_gen, NULL, NULL, NULL, NULL); + &left_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_inode_info(sctx->send_root, di_key.objectid, NULL, - &right_gen, NULL, NULL, NULL, NULL); + &right_gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) { if (ret == -ENOENT) ret = 0; @@ 
-3579,7 +3602,7 @@ static int check_ino_in_path(struct btrfs_root *root, } /* - * Check if ino ino1 is an ancestor of inode ino2 in the given root for any + * Check if inode ino1 is an ancestor of inode ino2 in the given root for any * possible path (in case ino2 is not a directory and has multiple hard links). * Return 1 if true, 0 if false and < 0 on error. */ @@ -3591,6 +3614,7 @@ static int is_ancestor(struct btrfs_root *root, { bool free_fs_path = false; int ret = 0; + int iter_ret = 0; struct btrfs_path *path = NULL; struct btrfs_key key; @@ -3611,26 +3635,12 @@ static int is_ancestor(struct btrfs_root *root, key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (true) { + btrfs_for_each_slot(root, &key, &key, path, iter_ret) { struct extent_buffer *leaf = path->nodes[0]; int slot = path->slots[0]; u32 cur_offset = 0; u32 item_size; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != ino2) break; if (key.type != BTRFS_INODE_REF_KEY && @@ -3660,7 +3670,7 @@ static int is_ancestor(struct btrfs_root *root, } ret = get_inode_info(root, parent, NULL, &parent_gen, - NULL, NULL, NULL, NULL); + NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = check_ino_in_path(root, ino1, ino1_gen, @@ -3668,10 +3678,12 @@ static int is_ancestor(struct btrfs_root *root, if (ret) goto out; } - path->slots[0]++; } ret = 0; - out: + if (iter_ret < 0) + ret = iter_ret; + +out: btrfs_free_path(path); if (free_fs_path) fs_path_free(fs_path); @@ -3750,7 +3762,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, ret = get_inode_info(sctx->parent_root, ino, NULL, &parent_ino_gen, NULL, NULL, NULL, - NULL); + NULL, NULL); if (ret < 0) goto out; if (ino_gen == parent_ino_gen) { @@ -4344,185 +4356,169 @@ out: return ret; } -static int record_ref(struct btrfs_root *root, u64 dir, struct fs_path *name, - void *ctx, struct list_head *refs) +static int rbtree_ref_comp(const void *k, const struct rb_node *node) +{ + const struct recorded_ref *data = k; + const struct recorded_ref *ref = rb_entry(node, struct recorded_ref, node); + int result; + + if (data->dir > ref->dir) + return 1; + if (data->dir < ref->dir) + return -1; + if (data->dir_gen > ref->dir_gen) + return 1; + if (data->dir_gen < ref->dir_gen) + return -1; + if (data->name_len > ref->name_len) + return 1; + if (data->name_len < ref->name_len) + return -1; + result = strcmp(data->name, ref->name); + if (result > 0) + return 1; + if (result < 0) + return -1; + return 0; +} + +static bool rbtree_ref_less(struct rb_node *node, const struct rb_node *parent) +{ + const struct recorded_ref *entry = rb_entry(node, struct recorded_ref, node); + + return rbtree_ref_comp(entry, parent) < 0; +} + +static int record_ref_in_tree(struct rb_root *root, struct list_head *refs, + struct fs_path *name, u64 dir, u64 dir_gen, + struct send_ctx *sctx) { int ret = 0; - struct send_ctx *sctx = ctx; - struct fs_path *p; - u64 gen; + struct fs_path *path = NULL; + struct recorded_ref *ref = NULL; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + path = fs_path_alloc(); + if (!path) { + ret = -ENOMEM; + goto out; + } - ret = get_inode_info(root, dir, NULL, &gen, NULL, NULL, - NULL, NULL); - if (ret < 0) + ref = recorded_ref_alloc(); + if (!ref) { + ret = -ENOMEM; goto out; + } - ret = get_cur_path(sctx, dir, gen, p); + ret = 
get_cur_path(sctx, dir, dir_gen, path); if (ret < 0) goto out; - ret = fs_path_add_path(p, name); + ret = fs_path_add_path(path, name); if (ret < 0) goto out; - ret = __record_ref(refs, dir, gen, p); - + ref->dir = dir; + ref->dir_gen = dir_gen; + set_ref_path(ref, path); + list_add_tail(&ref->list, refs); + rb_add(&ref->node, root, rbtree_ref_less); + ref->root = root; out: - if (ret) - fs_path_free(p); + if (ret) { + if (path && (!ref || !ref->full_path)) + fs_path_free(path); + recorded_ref_free(ref); + } return ret; } -static int __record_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - struct send_ctx *sctx = ctx; - return record_ref(sctx->send_root, dir, name, ctx, &sctx->new_refs); -} - - -static int __record_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_new_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { + int ret = 0; struct send_ctx *sctx = ctx; - return record_ref(sctx->parent_root, dir, name, ctx, - &sctx->deleted_refs); -} - -static int record_new_ref(struct send_ctx *sctx) -{ - int ret; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_new_ref, sctx); + ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_deleted_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_new_refs, + &sctx->new_refs, name, dir, dir_gen, + sctx); + } out: return ret; } -static int record_deleted_ref(struct send_ctx *sctx) +static int record_deleted_ref_if_needed(int num, u64 dir, int index, + struct fs_path *name, void *ctx) { - int ret; + int ret = 0; + struct send_ctx *sctx = ctx; + struct rb_node *node = NULL; + struct recorded_ref data; + struct recorded_ref *ref; + u64 dir_gen; - ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_deleted_ref, sctx); + ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, + NULL, NULL, NULL, NULL); if (ret < 0) goto out; - ret = 0; + data.dir = dir; + data.dir_gen = dir_gen; + set_ref_path(&data, name); + node = rb_find(&data, &sctx->rbtree_new_refs, rbtree_ref_comp); + if (node) { + ref = rb_entry(node, struct recorded_ref, node); + recorded_ref_free(ref); + } else { + ret = record_ref_in_tree(&sctx->rbtree_deleted_refs, + &sctx->deleted_refs, name, dir, + dir_gen, sctx); + } out: return ret; } -struct find_ref_ctx { - u64 dir; - u64 dir_gen; - struct btrfs_root *root; - struct fs_path *name; - int found_idx; -}; - -static int __find_iref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx_) -{ - struct find_ref_ctx *ctx = ctx_; - u64 dir_gen; - int ret; - - if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) && - strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) { - /* - * To avoid doing extra lookups we'll only do this if everything - * else matches. 
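record_new_ref_if_needed() and record_deleted_ref_if_needed() above keep the recorded references in two rbtrees keyed by (dir, dir_gen, name). When a ref being recorded has an identical counterpart in the opposite tree, the pair cancels out immediately, replacing the linear __find_iref()/find_iref() matching that the surrounding hunks remove. A standalone toy of that cancel-on-record idea follows; it uses a small array with linear lookup instead of the kernel rbtree, so only the semantics carry over.

#include <stdio.h>
#include <string.h>

struct ref { unsigned long long dir; unsigned long long dir_gen; char name[32]; };

struct ref_set { struct ref refs[16]; int count; };

/* Same ordering idea as the kernel comparator: dir, then dir_gen, then name. */
static int ref_cmp(const struct ref *a, const struct ref *b)
{
        if (a->dir != b->dir)
                return a->dir < b->dir ? -1 : 1;
        if (a->dir_gen != b->dir_gen)
                return a->dir_gen < b->dir_gen ? -1 : 1;
        return strcmp(a->name, b->name);
}

static int find_ref(const struct ref_set *s, const struct ref *key)
{
        for (int i = 0; i < s->count; i++)
                if (ref_cmp(key, &s->refs[i]) == 0)
                        return i;
        return -1;
}

static void del_ref(struct ref_set *s, int idx)
{
        s->refs[idx] = s->refs[--s->count];
}

/* Record into "mine" unless an identical entry exists in "other": then cancel. */
static void record_ref(struct ref_set *mine, struct ref_set *other, struct ref r)
{
        int idx = find_ref(other, &r);

        if (idx >= 0)
                del_ref(other, idx);    /* new + deleted pair is a no-op, drop both */
        else
                mine->refs[mine->count++] = r;
}

int main(void)
{
        struct ref_set new_refs = { .count = 0 }, deleted_refs = { .count = 0 };

        record_ref(&deleted_refs, &new_refs, (struct ref){ 256, 1, "a.txt" });
        record_ref(&new_refs, &deleted_refs, (struct ref){ 256, 1, "a.txt" });
        printf("new=%d deleted=%d\n", new_refs.count, deleted_refs.count); /* 0 0 */
        return 0;
}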
- */ - ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - if (dir_gen != ctx->dir_gen) - return 0; - ctx->found_idx = num; - return 1; - } - return 0; -} - -static int find_iref(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *key, - u64 dir, u64 dir_gen, struct fs_path *name) +static int record_new_ref(struct send_ctx *sctx) { int ret; - struct find_ref_ctx ctx; - - ctx.dir = dir; - ctx.name = name; - ctx.dir_gen = dir_gen; - ctx.found_idx = -1; - ctx.root = root; - ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx); + ret = iterate_inode_ref(sctx->send_root, sctx->left_path, + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - return ret; - - if (ctx.found_idx == -1) - return -ENOENT; - - return ctx.found_idx; -} - -static int __record_changed_new_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) -{ - u64 dir_gen; - int ret; - struct send_ctx *sctx = ctx; - - ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_new_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + goto out; + ret = 0; +out: return ret; } -static int __record_changed_deleted_ref(int num, u64 dir, int index, - struct fs_path *name, - void *ctx) +static int record_deleted_ref(struct send_ctx *sctx) { - u64 dir_gen; int ret; - struct send_ctx *sctx = ctx; - ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL, - NULL, NULL, NULL); - if (ret) - return ret; - - ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key, - dir, dir_gen, name); - if (ret == -ENOENT) - ret = __record_deleted_ref(num, dir, index, name, sctx); - else if (ret > 0) - ret = 0; + ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, + sctx->cmp_key, 0, record_deleted_ref_if_needed, + sctx); + if (ret < 0) + goto out; + ret = 0; +out: return ret; } @@ -4531,11 +4527,11 @@ static int record_changed_ref(struct send_ctx *sctx) int ret = 0; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, - sctx->cmp_key, 0, __record_changed_new_ref, sctx); + sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) goto out; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, - sctx->cmp_key, 0, __record_changed_deleted_ref, sctx); + sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) goto out; ret = 0; @@ -4551,13 +4547,12 @@ out: static int process_all_refs(struct send_ctx *sctx, enum btrfs_compare_tree_result cmd) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; iterate_inode_ref_t cb; int pending_move = 0; @@ -4567,10 +4562,10 @@ static int process_all_refs(struct send_ctx *sctx, if (cmd == BTRFS_COMPARE_TREE_NEW) { root = sctx->send_root; - cb = __record_new_ref; + cb = record_new_ref_if_needed; } else if (cmd == BTRFS_COMPARE_TREE_DELETED) { root = sctx->parent_root; - cb = __record_deleted_ref; + cb = record_deleted_ref_if_needed; } else { btrfs_err(sctx->send_root->fs_info, "Wrong command %d in process_all_refs", cmd); @@ -4581,24 +4576,7 @@ static int process_all_refs(struct send_ctx *sctx, key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - 
if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || (found_key.type != BTRFS_INODE_REF_KEY && found_key.type != BTRFS_INODE_EXTREF_KEY)) @@ -4607,8 +4585,11 @@ static int process_all_refs(struct send_ctx *sctx, ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } btrfs_release_path(path); @@ -4870,13 +4851,12 @@ out: static int process_all_new_xattrs(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; path = alloc_path_for_send(); if (!path) @@ -4887,39 +4867,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = iterate_dir_item(root, path, __process_new_xattr, sctx); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -4931,14 +4893,28 @@ static inline u64 max_send_read_size(const struct send_ctx *sctx) static int put_data_header(struct send_ctx *sctx, u32 len) { - struct btrfs_tlv_header *hdr; + if (WARN_ON_ONCE(sctx->put_data)) + return -EINVAL; + sctx->put_data = true; + if (sctx->proto >= 2) { + /* + * Since v2, the data attribute header doesn't include a length, + * it is implicitly to the end of the command. 
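put_data_header(), rewritten in this hunk, emits the BTRFS_SEND_A_DATA attribute differently per protocol version: v1 writes a full TLV header (16-bit type plus 16-bit length), while v2 writes only the type and lets the payload run to the end of the command. A standalone sketch of the two encodings follows; types and buffer handling are demo-local and host-endian, whereas the kernel writes little-endian.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ATTR_DATA 19    /* BTRFS_SEND_A_DATA in the attribute numbering */

struct send_buf { uint8_t data[128]; size_t size; };

/* v1: 16-bit type followed by a 16-bit payload length. */
static void put_data_header_v1(struct send_buf *b, uint16_t len)
{
        uint16_t type = ATTR_DATA;

        memcpy(b->data + b->size, &type, sizeof(type));
        memcpy(b->data + b->size + sizeof(type), &len, sizeof(len));
        b->size += 2 * sizeof(uint16_t);
}

/* v2: only the type; the payload implicitly runs to the end of the command. */
static void put_data_header_v2(struct send_buf *b)
{
        uint16_t type = ATTR_DATA;

        memcpy(b->data + b->size, &type, sizeof(type));
        b->size += sizeof(uint16_t);
}

int main(void)
{
        struct send_buf v1 = { .size = 0 }, v2 = { .size = 0 };

        put_data_header_v1(&v1, 4096);
        put_data_header_v2(&v2);
        printf("v1 header: %zu bytes, v2 header: %zu bytes\n", v1.size, v2.size);
        return 0;
}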
+ */ + if (sctx->send_max_size - sctx->send_size < sizeof(__le16) + len) + return -EOVERFLOW; + put_unaligned_le16(BTRFS_SEND_A_DATA, sctx->send_buf + sctx->send_size); + sctx->send_size += sizeof(__le16); + } else { + struct btrfs_tlv_header *hdr; - if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) - return -EOVERFLOW; - hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); - put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); - put_unaligned_le16(len, &hdr->tlv_len); - sctx->send_size += sizeof(*hdr); + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + } return 0; } @@ -4946,7 +4922,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; struct page *page; pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; @@ -4957,40 +4932,33 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - last_index = (offset + len - 1) >> PAGE_SHIFT; - /* initial readahead */ - memset(&sctx->ra, 0, sizeof(struct file_ra_state)); - file_ra_state_init(&sctx->ra, inode->i_mapping); - while (index <= last_index) { unsigned cur_len = min_t(unsigned, len, PAGE_SIZE - pg_offset); - page = find_lock_page(inode->i_mapping, index); + page = find_lock_page(sctx->cur_inode->i_mapping, index); if (!page) { - page_cache_sync_readahead(inode->i_mapping, &sctx->ra, - NULL, index, last_index + 1 - index); + page_cache_sync_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, index, + last_index + 1 - index); - page = find_or_create_page(inode->i_mapping, index, - GFP_KERNEL); + page = find_or_create_page(sctx->cur_inode->i_mapping, + index, GFP_KERNEL); if (!page) { ret = -ENOMEM; break; } } - if (PageReadahead(page)) { - page_cache_async_readahead(inode->i_mapping, &sctx->ra, - NULL, page, index, last_index + 1 - index); - } + if (PageReadahead(page)) + page_cache_async_readahead(sctx->cur_inode->i_mapping, + &sctx->ra, NULL, page_folio(page), + index, last_index + 1 - index); if (!PageUptodate(page)) { - btrfs_readpage(NULL, page); + btrfs_read_folio(NULL, page_folio(page)); lock_page(page); if (!PageUptodate(page)) { unlock_page(page); @@ -5013,7 +4981,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) len -= cur_len; sctx->send_size += cur_len; } - iput(inode); + return ret; } @@ -5089,7 +5057,7 @@ static int send_clone(struct send_ctx *sctx, if (clone_root->root == sctx->send_root) { ret = get_inode_info(sctx->send_root, clone_root->ino, NULL, - &gen, NULL, NULL, NULL, NULL); + &gen, NULL, NULL, NULL, NULL, NULL); if (ret < 0) goto out; ret = get_cur_path(sctx, clone_root->ino, gen, p); @@ -5216,16 +5184,250 @@ tlv_put_failure: return ret; } -static int send_extent_data(struct send_ctx *sctx, - const u64 offset, - const u64 len) +static int send_encoded_inline_extent(struct send_ctx *sctx, + struct btrfs_path *path, u64 offset, + u64 len) { + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; 
+ struct btrfs_file_extent_item *ei; + u64 ram_bytes; + size_t inline_size; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, ei); + inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + ram_bytes - offset, len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, ram_bytes); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, offset - key.offset); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + + ret = put_data_header(sctx, inline_size); + if (ret < 0) + goto out; + read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, + btrfs_file_extent_inline_start(ei), inline_size); + sctx->send_size += inline_size; + + ret = send_cmd(sctx); + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, + u64 offset, u64 len) +{ + struct btrfs_root *root = sctx->send_root; + struct btrfs_fs_info *fs_info = root->fs_info; + struct inode *inode; + struct fs_path *fspath; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_key key; + struct btrfs_file_extent_item *ei; + u64 disk_bytenr, disk_num_bytes; + u32 data_offset; + struct btrfs_cmd_header *hdr; + u32 crc; + int ret; + + inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + fspath = fs_path_alloc(); + if (!fspath) { + ret = -ENOMEM; + goto out; + } + + ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); + if (ret < 0) + goto out; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); + if (ret < 0) + goto out; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); + disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, ei); + + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, fspath); + TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_FILE_LEN, + min(key.offset + btrfs_file_extent_num_bytes(leaf, ei) - offset, + len)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_LEN, + btrfs_file_extent_ram_bytes(leaf, ei)); + TLV_PUT_U64(sctx, BTRFS_SEND_A_UNENCODED_OFFSET, + offset - key.offset + btrfs_file_extent_offset(leaf, ei)); + ret = btrfs_encoded_io_compression_from_extent(fs_info, + btrfs_file_extent_compression(leaf, ei)); + if (ret < 0) + goto out; + TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); + TLV_PUT_U32(sctx, BTRFS_SEND_A_ENCRYPTION, 0); + + ret = put_data_header(sctx, disk_num_bytes); + if (ret < 0) + goto out; + + /* + * We want to do I/O directly into the send buffer, so get the next page + * boundary in the send buffer. 
This means that there may be a gap + * between the beginning of the command and the file data. + */ + data_offset = ALIGN(sctx->send_size, PAGE_SIZE); + if (data_offset > sctx->send_max_size || + sctx->send_max_size - data_offset < disk_num_bytes) { + ret = -EOVERFLOW; + goto out; + } + + /* + * Note that send_buf is a mapping of send_buf_pages, so this is really + * reading into send_buf. + */ + ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), offset, + disk_bytenr, disk_num_bytes, + sctx->send_buf_pages + + (data_offset >> PAGE_SHIFT)); + if (ret) + goto out; + + hdr = (struct btrfs_cmd_header *)sctx->send_buf; + hdr->len = cpu_to_le32(sctx->send_size + disk_num_bytes - sizeof(*hdr)); + hdr->crc = 0; + crc = btrfs_crc32c(0, sctx->send_buf, sctx->send_size); + crc = btrfs_crc32c(crc, sctx->send_buf + data_offset, disk_num_bytes); + hdr->crc = cpu_to_le32(crc); + + ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, + &sctx->send_off); + if (!ret) { + ret = write_buf(sctx->send_filp, sctx->send_buf + data_offset, + disk_num_bytes, &sctx->send_off); + } + sctx->send_size = 0; + sctx->put_data = false; + +tlv_put_failure: +out: + fs_path_free(fspath); + iput(inode); + return ret; +} + +static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, + const u64 offset, const u64 len) +{ + const u64 end = offset + len; + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *ei; u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if ((sctx->flags & BTRFS_SEND_FLAG_COMPRESSED) && + btrfs_file_extent_compression(leaf, ei) != BTRFS_COMPRESS_NONE) { + bool is_inline = (btrfs_file_extent_type(leaf, ei) == + BTRFS_FILE_EXTENT_INLINE); + + /* + * Send the compressed extent unless the compressed data is + * larger than the decompressed data. This can happen if we're + * not sending the entire extent, either because it has been + * partially overwritten/truncated or because this is a part of + * the extent that we couldn't clone in clone_range(). + */ + if (is_inline && + btrfs_file_extent_inline_item_len(leaf, + path->slots[0]) <= len) { + return send_encoded_inline_extent(sctx, path, offset, + len); + } else if (!is_inline && + btrfs_file_extent_disk_num_bytes(leaf, ei) <= len) { + return send_encoded_extent(sctx, path, offset, len); + } + } + + if (sctx->cur_inode == NULL) { + struct btrfs_root *root = sctx->send_root; + + sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root); + if (IS_ERR(sctx->cur_inode)) { + int err = PTR_ERR(sctx->cur_inode); + + sctx->cur_inode = NULL; + return err; + } + memset(&sctx->ra, 0, sizeof(struct file_ra_state)); + file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); + + /* + * It's very likely there are no pages from this inode in the page + * cache, so after reading extents and sending their data, we clean + * the page cache to avoid trashing the page cache (adding pressure + * to the page cache and forcing eviction of other data more useful + * for applications). 
+ * + * We decide if we should clean the page cache simply by checking + * if the inode's mapping nrpages is 0 when we first open it, and + * not by using something like filemap_range_has_page() before + * reading an extent because when we ask the readahead code to + * read a given file range, it may (and almost always does) read + * pages from beyond that range (see the documentation for + * page_cache_sync_readahead()), so it would not be reliable, + * because after reading the first extent future calls to + * filemap_range_has_page() would return true because the readahead + * on the previous extent resulted in reading pages of the current + * extent as well. + */ + sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0); + sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE); + } + while (sent < len) { u64 size = min(len - sent, read_size); int ret; @@ -5235,6 +5437,37 @@ static int send_extent_data(struct send_ctx *sctx, return ret; sent += size; } + + if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) { + /* + * Always operate only on ranges that are a multiple of the page + * size. This is not only to prevent zeroing parts of a page in + * the case of subpage sector size, but also to guarantee we evict + * pages, as passing a range that is smaller than page size does + * not evict the respective page (only zeroes part of its content). + * + * Always start from the end offset of the last range cleared. + * This is because the readahead code may (and very often does) + * reads pages beyond the range we request for readahead. So if + * we have an extent layout like this: + * + * [ extent A ] [ extent B ] [ extent C ] + * + * When we ask page_cache_sync_readahead() to read extent A, it + * may also trigger reads for pages of extent B. If we are doing + * an incremental send and extent B has not changed between the + * parent and send snapshots, some or all of its pages may end + * up being read and placed in the page cache. So when truncating + * the page cache we always start from the end offset of the + * previously processed extent up to the end of the current + * extent. + */ + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + end - 1); + sctx->page_cache_clear_start = end; + } + return 0; } @@ -5296,12 +5529,9 @@ out: return ret; } -static int clone_range(struct send_ctx *sctx, - struct clone_root *clone_root, - const u64 disk_byte, - u64 data_offset, - u64 offset, - u64 len) +static int clone_range(struct send_ctx *sctx, struct btrfs_path *dst_path, + struct clone_root *clone_root, const u64 disk_byte, + u64 data_offset, u64 offset, u64 len) { struct btrfs_path *path; struct btrfs_key key; @@ -5325,7 +5555,7 @@ static int clone_range(struct send_ctx *sctx, */ if (clone_root->offset == 0 && len == sctx->send_root->fs_info->sectorsize) - return send_extent_data(sctx, offset, len); + return send_extent_data(sctx, dst_path, offset, len); path = alloc_path_for_send(); if (!path) @@ -5336,7 +5566,8 @@ static int clone_range(struct send_ctx *sctx, * accept clones from these extents. 
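send_encoded_extent() in the hunks above reads the on-disk (compressed) extent straight into the send buffer at the next page boundary after the command's attributes, then checksums and writes the two spans back to back, so the alignment gap is neither covered by the CRC nor transmitted. The standalone sketch below only models that framing; the header size and the checksum are stand-ins (the kernel zeroes hdr->crc and uses crc32c).

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define PAGE_SZ         4096ULL
#define ALIGN_UP(x)     (((x) + PAGE_SZ - 1) & ~(PAGE_SZ - 1))

/* Stand-in checksum, only to show which bytes are covered. */
static uint32_t demo_csum(uint32_t c, const uint8_t *p, size_t len)
{
        while (len--)
                c = c * 31 + *p++;
        return c;
}

int main(void)
{
        static uint8_t buf[2 * PAGE_SZ];
        size_t send_size = 10 + 100;            /* command header + attributes */
        size_t data_off = ALIGN_UP(send_size);  /* payload is read straight to here */
        size_t data_len = 512;                  /* compressed extent, on-disk size */
        uint32_t csum;

        memset(buf, 0xaa, sizeof(buf));         /* pretend header, attrs, payload */

        /* The checksum covers the two transmitted spans, not the alignment gap. */
        csum = demo_csum(0, buf, send_size);
        csum = demo_csum(csum, buf + data_off, data_len);

        /*
         * The kernel then issues two back-to-back writes, buf[0..send_size)
         * and buf[data_off..data_off + data_len), so the stream still carries
         * one contiguous command whose last attribute is the data itself.
         */
        printf("csum=%08x, %llu-byte gap skipped, %zu+%zu bytes written\n",
               (unsigned)csum, (unsigned long long)(data_off - send_size),
               send_size, data_len);
        return 0;
}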
*/ ret = __get_inode_info(clone_root->root, path, clone_root->ino, - &clone_src_i_size, NULL, NULL, NULL, NULL, NULL); + &clone_src_i_size, NULL, NULL, NULL, NULL, NULL, + NULL); btrfs_release_path(path); if (ret < 0) goto out; @@ -5422,7 +5653,8 @@ static int clone_range(struct send_ctx *sctx, if (hole_len > len) hole_len = len; - ret = send_extent_data(sctx, offset, hole_len); + ret = send_extent_data(sctx, dst_path, offset, + hole_len); if (ret < 0) goto out; @@ -5495,14 +5727,16 @@ static int clone_range(struct send_ctx *sctx, if (ret < 0) goto out; } - ret = send_extent_data(sctx, offset + slen, + ret = send_extent_data(sctx, dst_path, + offset + slen, clone_len - slen); } else { ret = send_clone(sctx, offset, clone_len, clone_root); } } else { - ret = send_extent_data(sctx, offset, clone_len); + ret = send_extent_data(sctx, dst_path, offset, + clone_len); } if (ret < 0) @@ -5534,7 +5768,7 @@ next: } if (len > 0) - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, dst_path, offset, len); else ret = 0; out: @@ -5565,10 +5799,10 @@ static int send_write_or_clone(struct send_ctx *sctx, struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); - ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, end - offset); + ret = clone_range(sctx, path, clone_root, disk_byte, + data_offset, offset, end - offset); } else { - ret = send_extent_data(sctx, offset, end - offset); + ret = send_extent_data(sctx, path, offset, end - offset); } sctx->cur_inode_next_write_offset = end; return ret; @@ -5965,13 +6199,12 @@ out: static int process_all_extents(struct send_ctx *sctx) { - int ret; + int ret = 0; + int iter_ret = 0; struct btrfs_root *root; struct btrfs_path *path; struct btrfs_key key; struct btrfs_key found_key; - struct extent_buffer *eb; - int slot; root = sctx->send_root; path = alloc_path_for_send(); @@ -5981,41 +6214,21 @@ static int process_all_extents(struct send_ctx *sctx) key.objectid = sctx->cmp_key->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto out; - - while (1) { - eb = path->nodes[0]; - slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - ret = 0; - break; - } - continue; - } - - btrfs_item_key_to_cpu(eb, &found_key, slot); - + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { if (found_key.objectid != key.objectid || found_key.type != key.type) { ret = 0; - goto out; + break; } ret = process_extent(sctx, path, &found_key); if (ret < 0) - goto out; - - path->slots[0]++; + break; } + /* Catch error found during iteration */ + if (iter_ret < 0) + ret = iter_ret; -out: btrfs_free_path(path); return ret; } @@ -6049,11 +6262,14 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) u64 left_mode; u64 left_uid; u64 left_gid; + u64 left_fileattr; u64 right_mode; u64 right_uid; u64 right_gid; + u64 right_fileattr; int need_chmod = 0; int need_chown = 0; + bool need_fileattr = false; int need_truncate = 1; int pending_move = 0; int refs_processed = 0; @@ -6087,7 +6303,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) goto out; ret = get_inode_info(sctx->send_root, sctx->cur_ino, NULL, NULL, - &left_mode, &left_uid, &left_gid, NULL); + &left_mode, &left_uid, &left_gid, NULL, &left_fileattr); if (ret < 0) goto 
out; @@ -6102,7 +6318,7 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) ret = get_inode_info(sctx->parent_root, sctx->cur_ino, &old_size, NULL, &right_mode, &right_uid, - &right_gid, NULL); + &right_gid, NULL, &right_fileattr); if (ret < 0) goto out; @@ -6110,6 +6326,8 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) need_chown = 1; if (!S_ISLNK(sctx->cur_inode_mode) && left_mode != right_mode) need_chmod = 1; + if (!S_ISLNK(sctx->cur_inode_mode) && left_fileattr != right_fileattr) + need_fileattr = true; if ((old_size == sctx->cur_inode_size) || (sctx->cur_inode_size > old_size && sctx->cur_inode_next_write_offset == sctx->cur_inode_size)) @@ -6153,6 +6371,12 @@ static int finish_inode_if_needed(struct send_ctx *sctx, int at_end) if (ret < 0) goto out; } + if (need_fileattr) { + ret = send_fileattr(sctx, sctx->cur_ino, sctx->cur_inode_gen, + left_fileattr); + if (ret < 0) + goto out; + } ret = send_capabilities(sctx); if (ret < 0) @@ -6193,8 +6417,13 @@ static int record_parent_ref(int num, u64 dir, int index, struct fs_path *name, { struct parent_paths_ctx *ppctx = ctx; - return record_ref(ppctx->sctx->parent_root, dir, name, ppctx->sctx, - ppctx->refs); + /* + * Pass 0 as the generation for the directory, we don't care about it + * here as we have no new references to add, we just want to delete all + * references for an inode. + */ + return record_ref_in_tree(&ppctx->sctx->rbtree_deleted_refs, ppctx->refs, + name, dir, 0, ppctx->sctx); } /* @@ -6205,8 +6434,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) { LIST_HEAD(deleted_refs); struct btrfs_path *path; + struct btrfs_root *root = sctx->parent_root; struct btrfs_key key; + struct btrfs_key found_key; struct parent_paths_ctx ctx; + int iter_ret = 0; int ret; path = alloc_path_for_send(); @@ -6216,39 +6448,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) key.objectid = sctx->cur_ino; key.type = BTRFS_INODE_REF_KEY; key.offset = 0; - ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0); - if (ret < 0) - goto out; ctx.refs = &deleted_refs; ctx.sctx = sctx; - while (true) { - struct extent_buffer *eb = path->nodes[0]; - int slot = path->slots[0]; - - if (slot >= btrfs_header_nritems(eb)) { - ret = btrfs_next_leaf(sctx->parent_root, path); - if (ret < 0) - goto out; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(eb, &key, slot); - if (key.objectid != sctx->cur_ino) + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + if (found_key.objectid != key.objectid) break; - if (key.type != BTRFS_INODE_REF_KEY && - key.type != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != key.type && + found_key.type != BTRFS_INODE_EXTREF_KEY) break; - ret = iterate_inode_ref(sctx->parent_root, path, &key, 1, + ret = iterate_inode_ref(root, path, &found_key, 1, record_parent_ref, &ctx); if (ret < 0) goto out; - - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto out; } while (!list_empty(&deleted_refs)) { @@ -6258,9 +6477,7 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx) ret = send_unlink(sctx, ref->full_path); if (ret < 0) goto out; - fs_path_free(ref->full_path); - list_del(&ref->list); - kfree(ref); + recorded_ref_free(ref); } ret = 0; out: @@ -6270,6 +6487,30 @@ out: return ret; } +static void close_current_inode(struct send_ctx *sctx) +{ + u64 i_size; + + if (sctx->cur_inode == NULL) + return; + + i_size = i_size_read(sctx->cur_inode); + + /* + * If we are doing 
an incremental send, we may have extents between the + * last processed extent and the i_size that have not been processed + * because they haven't changed but we may have read some of their pages + * through readahead, see the comments at send_extent_data(). + */ + if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size) + truncate_inode_pages_range(&sctx->cur_inode->i_data, + sctx->page_cache_clear_start, + round_up(i_size, PAGE_SIZE) - 1); + + iput(sctx->cur_inode); + sctx->cur_inode = NULL; +} + static int changed_inode(struct send_ctx *sctx, enum btrfs_compare_tree_result result) { @@ -6280,8 +6521,10 @@ static int changed_inode(struct send_ctx *sctx, u64 left_gen = 0; u64 right_gen = 0; + close_current_inode(sctx); + sctx->cur_ino = key->objectid; - sctx->cur_inode_new_gen = 0; + sctx->cur_inode_new_gen = false; sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; @@ -6322,7 +6565,7 @@ static int changed_inode(struct send_ctx *sctx, */ if (left_gen != right_gen && sctx->cur_ino != BTRFS_FIRST_FREE_OBJECTID) - sctx->cur_inode_new_gen = 1; + sctx->cur_inode_new_gen = true; } /* @@ -6354,8 +6597,8 @@ static int changed_inode(struct send_ctx *sctx, if (result == BTRFS_COMPARE_TREE_NEW) { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6366,8 +6609,8 @@ static int changed_inode(struct send_ctx *sctx, ret = send_create_inode_if_needed(sctx); } else if (result == BTRFS_COMPARE_TREE_DELETED) { sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6385,8 +6628,8 @@ static int changed_inode(struct send_ctx *sctx, * First, process the inode as if it was deleted. */ sctx->cur_inode_gen = right_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_deleted = 1; + sctx->cur_inode_new = false; + sctx->cur_inode_deleted = true; sctx->cur_inode_size = btrfs_inode_size( sctx->right_path->nodes[0], right_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6400,8 +6643,8 @@ static int changed_inode(struct send_ctx *sctx, * Now process the inode as if it was new. 
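send_extent_data() and close_current_inode() above evict, behind the send stream, the pages that reading file data pulled into the page cache: only page-aligned ranges are truncated, each range starts where the previous one ended (readahead may have populated pages past the requested extent), and a final sweep at inode-switch time runs up to the page containing i_size. A standalone demo of just that range arithmetic, with made-up extent end offsets and i_size:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SZ         4096ULL
#define ROUND_DOWN(x)   ((x) & ~(PAGE_SZ - 1))
#define ROUND_UP(x)     (((x) + PAGE_SZ - 1) & ~(PAGE_SZ - 1))
#define IS_ALIGNED(x)   (((x) & (PAGE_SZ - 1)) == 0)

int main(void)
{
        /* Extent end offsets as send would see them for one file. */
        const uint64_t ends[] = { 8192, 10000, 16384 };
        uint64_t clear_start = ROUND_DOWN(0);   /* page_cache_clear_start */
        uint64_t i_size = 20000;

        for (int i = 0; i < 3; i++) {
                uint64_t end = ends[i];

                /* Only whole pages are evicted; partial pages wait for later. */
                if (IS_ALIGNED(end)) {
                        printf("truncate [%llu, %llu]\n",
                               (unsigned long long)clear_start,
                               (unsigned long long)(end - 1));
                        clear_start = end;
                }
        }
        /* close_current_inode(): final sweep up to the page containing i_size. */
        if (clear_start < i_size)
                printf("truncate [%llu, %llu]\n",
                       (unsigned long long)clear_start,
                       (unsigned long long)(ROUND_UP(i_size) - 1));
        return 0;
}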
*/ sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 1; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = true; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6433,9 +6676,9 @@ static int changed_inode(struct send_ctx *sctx, goto out; } else { sctx->cur_inode_gen = left_gen; - sctx->cur_inode_new = 0; - sctx->cur_inode_new_gen = 0; - sctx->cur_inode_deleted = 0; + sctx->cur_inode_new = false; + sctx->cur_inode_new_gen = false; + sctx->cur_inode_deleted = false; sctx->cur_inode_size = btrfs_inode_size( sctx->left_path->nodes[0], left_ii); sctx->cur_inode_mode = btrfs_inode_mode( @@ -6548,12 +6791,12 @@ static int dir_changed(struct send_ctx *sctx, u64 dir) int ret; ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL, - NULL, NULL); + NULL, NULL, NULL); if (ret) return ret; ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL, - NULL, NULL, NULL); + NULL, NULL, NULL, NULL); if (ret) return ret; @@ -7549,6 +7792,10 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } else { sctx->proto = 1; } + if ((arg->flags & BTRFS_SEND_FLAG_COMPRESSED) && sctx->proto < 2) { + ret = -EINVAL; + goto out; + } sctx->send_filp = fget(arg->send_fd); if (!sctx->send_filp) { @@ -7568,8 +7815,31 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->clone_roots_cnt = arg->clone_sources_count; - sctx->send_max_size = BTRFS_SEND_BUF_SIZE; - sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + if (sctx->proto >= 2) { + u32 send_buf_num_pages; + + sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE); + sctx->send_buf = vmalloc(sctx->send_max_size); + if (!sctx->send_buf) { + ret = -ENOMEM; + goto out; + } + send_buf_num_pages = sctx->send_max_size >> PAGE_SHIFT; + sctx->send_buf_pages = kcalloc(send_buf_num_pages, + sizeof(*sctx->send_buf_pages), + GFP_KERNEL); + if (!sctx->send_buf_pages) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < send_buf_num_pages; i++) { + sctx->send_buf_pages[i] = + vmalloc_to_page(sctx->send_buf + (i << PAGE_SHIFT)); + } + } else { + sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V1; + sctx->send_buf = kvmalloc(sctx->send_max_size, GFP_KERNEL); + } if (!sctx->send_buf) { ret = -ENOMEM; goto out; @@ -7578,6 +7848,8 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; + sctx->rbtree_new_refs = RB_ROOT; + sctx->rbtree_deleted_refs = RB_ROOT; sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), arg->clone_sources_count + 1, @@ -7762,10 +8034,13 @@ out: fput(sctx->send_filp); kvfree(sctx->clone_roots); + kfree(sctx->send_buf_pages); kvfree(sctx->send_buf); name_cache_free(sctx); + close_current_inode(sctx); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 08602fdd600a..4bb4e6a638cb 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -7,12 +7,19 @@ #ifndef BTRFS_SEND_H #define BTRFS_SEND_H -#include "ctree.h" +#include <linux/types.h> #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" -#define BTRFS_SEND_STREAM_VERSION 1 +#define BTRFS_SEND_STREAM_VERSION 2 -#define BTRFS_SEND_BUF_SIZE SZ_64K +/* + * In send stream v1, no command is larger than 64K. In send stream v2, no limit + * should be assumed. 
+ */ +#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K + +struct inode; +struct btrfs_ioctl_send_args; enum btrfs_tlv_type { BTRFS_TLV_U8, @@ -46,87 +53,117 @@ struct btrfs_tlv_header { /* commands */ enum btrfs_send_cmd { - BTRFS_SEND_C_UNSPEC, + BTRFS_SEND_C_UNSPEC = 0, /* Version 1 */ - BTRFS_SEND_C_SUBVOL, - BTRFS_SEND_C_SNAPSHOT, + BTRFS_SEND_C_SUBVOL = 1, + BTRFS_SEND_C_SNAPSHOT = 2, - BTRFS_SEND_C_MKFILE, - BTRFS_SEND_C_MKDIR, - BTRFS_SEND_C_MKNOD, - BTRFS_SEND_C_MKFIFO, - BTRFS_SEND_C_MKSOCK, - BTRFS_SEND_C_SYMLINK, + BTRFS_SEND_C_MKFILE = 3, + BTRFS_SEND_C_MKDIR = 4, + BTRFS_SEND_C_MKNOD = 5, + BTRFS_SEND_C_MKFIFO = 6, + BTRFS_SEND_C_MKSOCK = 7, + BTRFS_SEND_C_SYMLINK = 8, - BTRFS_SEND_C_RENAME, - BTRFS_SEND_C_LINK, - BTRFS_SEND_C_UNLINK, - BTRFS_SEND_C_RMDIR, + BTRFS_SEND_C_RENAME = 9, + BTRFS_SEND_C_LINK = 10, + BTRFS_SEND_C_UNLINK = 11, + BTRFS_SEND_C_RMDIR = 12, - BTRFS_SEND_C_SET_XATTR, - BTRFS_SEND_C_REMOVE_XATTR, + BTRFS_SEND_C_SET_XATTR = 13, + BTRFS_SEND_C_REMOVE_XATTR = 14, - BTRFS_SEND_C_WRITE, - BTRFS_SEND_C_CLONE, + BTRFS_SEND_C_WRITE = 15, + BTRFS_SEND_C_CLONE = 16, - BTRFS_SEND_C_TRUNCATE, - BTRFS_SEND_C_CHMOD, - BTRFS_SEND_C_CHOWN, - BTRFS_SEND_C_UTIMES, + BTRFS_SEND_C_TRUNCATE = 17, + BTRFS_SEND_C_CHMOD = 18, + BTRFS_SEND_C_CHOWN = 19, + BTRFS_SEND_C_UTIMES = 20, - BTRFS_SEND_C_END, - BTRFS_SEND_C_UPDATE_EXTENT, - __BTRFS_SEND_C_MAX_V1, + BTRFS_SEND_C_END = 21, + BTRFS_SEND_C_UPDATE_EXTENT = 22, + BTRFS_SEND_C_MAX_V1 = 22, /* Version 2 */ - __BTRFS_SEND_C_MAX_V2, + BTRFS_SEND_C_FALLOCATE = 23, + BTRFS_SEND_C_FILEATTR = 24, + BTRFS_SEND_C_ENCODED_WRITE = 25, + BTRFS_SEND_C_MAX_V2 = 25, /* End */ - __BTRFS_SEND_C_MAX, + BTRFS_SEND_C_MAX = 25, }; -#define BTRFS_SEND_C_MAX (__BTRFS_SEND_C_MAX - 1) /* attributes in send stream */ enum { - BTRFS_SEND_A_UNSPEC, - - BTRFS_SEND_A_UUID, - BTRFS_SEND_A_CTRANSID, - - BTRFS_SEND_A_INO, - BTRFS_SEND_A_SIZE, - BTRFS_SEND_A_MODE, - BTRFS_SEND_A_UID, - BTRFS_SEND_A_GID, - BTRFS_SEND_A_RDEV, - BTRFS_SEND_A_CTIME, - BTRFS_SEND_A_MTIME, - BTRFS_SEND_A_ATIME, - BTRFS_SEND_A_OTIME, - - BTRFS_SEND_A_XATTR_NAME, - BTRFS_SEND_A_XATTR_DATA, - - BTRFS_SEND_A_PATH, - BTRFS_SEND_A_PATH_TO, - BTRFS_SEND_A_PATH_LINK, - - BTRFS_SEND_A_FILE_OFFSET, - BTRFS_SEND_A_DATA, - - BTRFS_SEND_A_CLONE_UUID, - BTRFS_SEND_A_CLONE_CTRANSID, - BTRFS_SEND_A_CLONE_PATH, - BTRFS_SEND_A_CLONE_OFFSET, - BTRFS_SEND_A_CLONE_LEN, - - __BTRFS_SEND_A_MAX, + BTRFS_SEND_A_UNSPEC = 0, + + /* Version 1 */ + BTRFS_SEND_A_UUID = 1, + BTRFS_SEND_A_CTRANSID = 2, + + BTRFS_SEND_A_INO = 3, + BTRFS_SEND_A_SIZE = 4, + BTRFS_SEND_A_MODE = 5, + BTRFS_SEND_A_UID = 6, + BTRFS_SEND_A_GID = 7, + BTRFS_SEND_A_RDEV = 8, + BTRFS_SEND_A_CTIME = 9, + BTRFS_SEND_A_MTIME = 10, + BTRFS_SEND_A_ATIME = 11, + BTRFS_SEND_A_OTIME = 12, + + BTRFS_SEND_A_XATTR_NAME = 13, + BTRFS_SEND_A_XATTR_DATA = 14, + + BTRFS_SEND_A_PATH = 15, + BTRFS_SEND_A_PATH_TO = 16, + BTRFS_SEND_A_PATH_LINK = 17, + + BTRFS_SEND_A_FILE_OFFSET = 18, + /* + * As of send stream v2, this attribute is special: it must be the last + * attribute in a command, its header contains only the type, and its + * length is implicitly the remaining length of the command. 
+ */ + BTRFS_SEND_A_DATA = 19, + + BTRFS_SEND_A_CLONE_UUID = 20, + BTRFS_SEND_A_CLONE_CTRANSID = 21, + BTRFS_SEND_A_CLONE_PATH = 22, + BTRFS_SEND_A_CLONE_OFFSET = 23, + BTRFS_SEND_A_CLONE_LEN = 24, + + BTRFS_SEND_A_MAX_V1 = 24, + + /* Version 2 */ + BTRFS_SEND_A_FALLOCATE_MODE = 25, + + /* + * File attributes from the FS_*_FL namespace (i_flags, xflags), + * translated to BTRFS_INODE_* bits (BTRFS_INODE_FLAG_MASK) and stored + * in btrfs_inode_item::flags (represented by btrfs_inode::flags and + * btrfs_inode::ro_flags). + */ + BTRFS_SEND_A_FILEATTR = 26, + + BTRFS_SEND_A_UNENCODED_FILE_LEN = 27, + BTRFS_SEND_A_UNENCODED_LEN = 28, + BTRFS_SEND_A_UNENCODED_OFFSET = 29, + /* + * COMPRESSION and ENCRYPTION default to NONE (0) if omitted from + * BTRFS_SEND_C_ENCODED_WRITE. + */ + BTRFS_SEND_A_COMPRESSION = 30, + BTRFS_SEND_A_ENCRYPTION = 31, + BTRFS_SEND_A_MAX_V2 = 31, + + /* End */ + BTRFS_SEND_A_MAX = 31, }; -#define BTRFS_SEND_A_MAX (__BTRFS_SEND_A_MAX - 1) -#ifdef __KERNEL__ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg); -#endif #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index b87931a458eb..d0cbeb7ae81c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -9,6 +9,7 @@ #include "ordered-data.h" #include "transaction.h" #include "block-group.h" +#include "zoned.h" /* * HOW DOES SPACE RESERVATION WORK @@ -181,6 +182,43 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) found->full = 0; } +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. + */ +#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75) + +/* + * Calculate chunk size depending on volume type (regular or zoned). + */ +static u64 calc_chunk_size(const struct btrfs_fs_info *fs_info, u64 flags) +{ + if (btrfs_is_zoned(fs_info)) + return fs_info->zone_size; + + ASSERT(flags & BTRFS_BLOCK_GROUP_TYPE_MASK); + + if (flags & BTRFS_BLOCK_GROUP_DATA) + return SZ_1G; + else if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + return SZ_32M; + + /* Handle BTRFS_BLOCK_GROUP_METADATA */ + if (fs_info->fs_devices->total_rw_bytes > 50ULL * SZ_1G) + return SZ_1G; + + return SZ_256M; +} + +/* + * Update default chunk size. 
+ */ +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size) +{ + WRITE_ONCE(space_info->chunk_size, chunk_size); +} + static int create_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -202,6 +240,10 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->tickets); INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; + btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + + if (btrfs_is_zoned(info)) + space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -254,7 +296,7 @@ out: void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info) + bool active, struct btrfs_space_info **space_info) { struct btrfs_space_info *found; int factor; @@ -265,6 +307,8 @@ void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, ASSERT(found); spin_lock(&found->lock); found->total_bytes += total_bytes; + if (active) + found->active_total_bytes += total_bytes; found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; @@ -328,6 +372,22 @@ static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, return avail; } +static inline u64 writable_total_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info) +{ + /* + * On regular filesystem, all total_bytes are always writable. On zoned + * filesystem, there may be a limitation imposed by max_active_zones. + * For metadata allocation, we cannot finish an existing active block + * group to avoid a deadlock. Thus, we need to consider only the active + * groups to be writable for metadata space. + */ + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return space_info->total_bytes; + + return space_info->active_total_bytes; +} + int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) @@ -340,9 +400,12 @@ int btrfs_can_overcommit(struct btrfs_fs_info *fs_info, return 0; used = btrfs_space_info_used(space_info, true); - avail = calc_available_free_space(fs_info, space_info, flush); + if (btrfs_is_zoned(fs_info) && (space_info->flags & BTRFS_BLOCK_GROUP_METADATA)) + avail = 0; + else + avail = calc_available_free_space(fs_info, space_info, flush); - if (used + bytes < space_info->total_bytes + avail) + if (used + bytes < writable_total_bytes(fs_info, space_info) + avail) return 1; return 0; } @@ -378,7 +441,7 @@ again: ticket = list_first_entry(head, struct reserve_ticket, list); /* Check and see if our ticket can be satisfied now. 
*/ - if ((used + ticket->bytes <= space_info->total_bytes) || + if ((used + ticket->bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, ticket->bytes, flush)) { btrfs_space_info_update_bytes_may_use(fs_info, @@ -519,7 +582,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; } - trans = (struct btrfs_trans_handle *)current->journal_info; + trans = current->journal_info; /* * If we are doing more ordered than delalloc we need to just wait on @@ -662,6 +725,18 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case ALLOC_CHUNK: case ALLOC_CHUNK_FORCE: + /* + * For metadata space on zoned filesystem, reaching here means we + * don't have enough space left in active_total_bytes. Try to + * activate a block group first, because we may have inactive + * block group already allocated. + */ + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, false); + if (ret < 0) + break; + else if (ret == 1) + break; + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -672,6 +747,23 @@ static void flush_space(struct btrfs_fs_info *fs_info, (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); + + /* + * For metadata space on zoned filesystem, allocating a new chunk + * is not enough. We still need to activate the block * group. + * Activate the newly allocated block group by (maybe) finishing + * a block group. + */ + if (ret == 1) { + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); + /* + * Revert to the original ret regardless of whether we could finish + * one block group or not. + */ + if (ret >= 0) + ret = 1; + } + if (ret > 0 || ret == -ENOSPC) ret = 0; break; @@ -709,6 +801,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, { u64 used; u64 avail; + u64 total; u64 to_reclaim = space_info->reclaim_size; lockdep_assert_held(&space_info->lock); @@ -723,8 +816,9 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_fs_info *fs_info, * space. If that's the case add in our overage so we make sure to put * appropriate pressure on the flushing state machine. */ - if (space_info->total_bytes + avail < used) - to_reclaim += used - (space_info->total_bytes + avail); + total = writable_total_bytes(fs_info, space_info); + if (total + avail < used) + to_reclaim += used - (total + avail); return to_reclaim; } @@ -734,9 +828,12 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, { u64 global_rsv_size = fs_info->global_block_rsv.reserved; u64 ordered, delalloc; - u64 thresh = div_factor_fine(space_info->total_bytes, 90); + u64 total = writable_total_bytes(fs_info, space_info); + u64 thresh; u64 used; + thresh = div_factor_fine(total, 90); + lockdep_assert_held(&space_info->lock); /* If we're just plain full then async reclaim just slows us down.
*/ @@ -798,8 +895,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, BTRFS_RESERVE_FLUSH_ALL); used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_readonly + global_rsv_size; - if (used < space_info->total_bytes) - thresh += space_info->total_bytes - used; + if (used < total) + thresh += total - used; thresh >>= space_info->clamp; used = space_info->bytes_pinned; @@ -1271,7 +1368,7 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, to_reclaim = btrfs_calc_reclaim_metadata_size(fs_info, space_info); /* * This is the priority reclaim path, so to_reclaim could be >0 still - * because we may have only satisified the priority tickets and still + * because we may have only satisfied the priority tickets and still * left non priority tickets on the list. We would then have * to_reclaim but ->bytes == 0. */ @@ -1516,7 +1613,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, * can_overcommit() to ensure we can overcommit to continue. */ if (!pending_tickets && - ((used + orig_bytes <= space_info->total_bytes) || + ((used + orig_bytes <= writable_total_bytes(fs_info, space_info)) || btrfs_can_overcommit(fs_info, space_info, orig_bytes, flush))) { btrfs_space_info_update_bytes_may_use(fs_info, space_info, orig_bytes); diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index d841fed73492..12fd6147f92d 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -3,6 +3,8 @@ #ifndef BTRFS_SPACE_INFO_H #define BTRFS_SPACE_INFO_H +#include "volumes.h" + struct btrfs_space_info { spinlock_t lock; @@ -17,12 +19,22 @@ struct btrfs_space_info { u64 bytes_may_use; /* number of bytes that may be used for delalloc/allocations */ u64 bytes_readonly; /* total bytes that are read only */ + /* Total bytes in the space, but only accounts active block groups. */ + u64 active_total_bytes; u64 bytes_zone_unusable; /* total bytes that are unusable until resetting the device zone */ u64 max_extent_size; /* This will hold the maximum extent size of the space info if we had an ENOSPC in the allocator. */ + /* Chunk size in bytes */ + u64 chunk_size; + + /* + * Once a block group drops below this threshold (percents) we'll + * schedule it for reclaim. + */ + int bg_reclaim_threshold; int clamp; /* Used to scale our threshold for preemptive flushing. 
The value is >> clamp, so turns @@ -114,7 +126,9 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info); void btrfs_update_space_info(struct btrfs_fs_info *info, u64 flags, u64 total_bytes, u64 bytes_used, u64 bytes_readonly, u64 bytes_zone_unusable, - struct btrfs_space_info **space_info); + bool active, struct btrfs_space_info **space_info); +void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, + u64 chunk_size); struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, u64 flags); u64 __pure btrfs_space_info_used(struct btrfs_space_info *s_info, diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index f429256f56db..12455b2b41de 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -12,15 +12,10 @@ static bool check_setget_bounds(const struct extent_buffer *eb, { const unsigned long member_offset = (unsigned long)ptr + off; - if (member_offset > eb->len) { + if (unlikely(member_offset + size > eb->len)) { btrfs_warn(eb->fs_info, - "bad eb member start: ptr 0x%lx start %llu member offset %lu size %d", - (unsigned long)ptr, eb->start, member_offset, size); - return false; - } - if (member_offset + size > eb->len) { - btrfs_warn(eb->fs_info, - "bad eb member end: ptr 0x%lx start %llu member offset %lu size %d", + "bad eb member %s: ptr 0x%lx start %llu member offset %lu size %d", + (member_offset > eb->len ? "start" : "end"), (unsigned long)ptr, eb->start, member_offset, size); return false; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index ef7ae20d2b77..6fc2b77ae5c3 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -63,6 +63,29 @@ * This means a slightly higher tree locking latency. */ +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +{ + if (fs_info->sectorsize >= PAGE_SIZE) + return false; + + /* + * Only data pages (either through DIO or compression) can have no + * mapping. And if page->mapping->host is data inode, it's subpage. + * As we have ruled our sectorsize >= PAGE_SIZE case already. + */ + if (!page->mapping || !page->mapping->host || + is_data_inode(page->mapping->host)) + return true; + + /* + * Now the only remaining case is metadata, which we only go subpage + * routine if nodesize < PAGE_SIZE. + */ + if (fs_info->nodesize < PAGE_SIZE) + return true; + return false; +} + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize) { unsigned int cur = 0; @@ -100,14 +123,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* - * We have cases like a dummy extent buffer page, which is not mappped + * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. 
*/ if (page->mapping) ASSERT(PageLocked(page)); /* Either not subpage, or the page already has private attached */ - if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); @@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct btrfs_subpage *subpage; /* Either not subpage, or already detached */ - if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page)) + if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) return; - subpage = (struct btrfs_subpage *)detach_page_private(page); + subpage = detach_page_private(page); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, { struct btrfs_subpage *subpage; - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->mapping); @@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { lock_page(page); return 0; } @@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, struct page *page, u64 start, u32 len) { - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) return unlock_page(page); btrfs_subpage_clamp_range(page, &start, &len); if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) @@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ } \ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ set_page_func(page); \ return; \ } \ @@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const 
struct btrfs_fs_info *fs_info, \ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ clear_page_func(page); \ return; \ } \ @@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct page *page, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \ + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ return test_page_func(page); \ btrfs_subpage_clamp_range(page, &start, &len); \ return btrfs_subpage_test_##name(fs_info, page, start, len); \ @@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, return; ASSERT(!PageDirty(page)); - if (fs_info->sectorsize == PAGE_SIZE) + if (!btrfs_is_subpage(fs_info, page)) return; ASSERT(PagePrivate(page) && page->private); @@ -708,7 +731,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * It should not have any subpage::writers count. * Can be unlocked by unlock_page(). * This is the most common locked page for __extent_writepage() called - * inside extent_write_cache_pages() or extent_write_full_page(). + * inside extent_write_cache_pages(). * Rarer cases include the @locked_page from extent_write_locked_range(). * * - Page locked by lock_delalloc_pages() @@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, struct btrfs_subpage *subpage; ASSERT(PageLocked(page)); - /* For regular page size case, we just unlock the page */ - if (fs_info->sectorsize == PAGE_SIZE) + /* For non-subpage case, we just unlock the page */ + if (!btrfs_is_subpage(fs_info, page)) return unlock_page(page); ASSERT(PagePrivate(page) && page->private); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 7accb5c40d33..0e80ad336904 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -74,6 +74,8 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); + void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index b228efe8ab6e..4c7089b1681b 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -48,6 +48,7 @@ #include "block-group.h" #include "discard.h" #include "qgroup.h" +#include "raid56.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -72,7 +73,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data); #define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT) /* - * Characters to print to indicate error conditions or uncommon filesystem sate. + * Characters to print to indicate error conditions or uncommon filesystem state. * RO is not an error. */ static const char fs_state_chars[] = { @@ -261,7 +262,7 @@ static struct ratelimit_state printk_limits[] = { RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100), }; -void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) +void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) 
{ char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; @@ -292,10 +293,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . char statestr[STATE_STRING_BUF_LEN]; btrfs_state_to_string(fs_info, statestr); - printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, + _printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type, fs_info->sb->s_id, statestr, &vaf); } else { - printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); + _printk("%sBTRFS %s: %pV\n", lvl, type, &vaf); } } @@ -763,6 +764,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, compress_force = false; no_compress++; } else { + btrfs_err(info, "unrecognized compression value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -821,8 +824,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_thread_pool: ret = match_int(&args[0], &intarg); if (ret) { + btrfs_err(info, "unrecognized thread_pool value %s", + args[0].from); goto out; } else if (intarg == 0) { + btrfs_err(info, "invalid value 0 for thread_pool"); ret = -EINVAL; goto out; } @@ -883,8 +889,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_ratio: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized metadata_ratio value %s", + args[0].from); goto out; + } info->metadata_ratio = intarg; btrfs_info(info, "metadata ratio %u", info->metadata_ratio); @@ -901,6 +910,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, DISCARD_ASYNC, "turning on async discard"); } else { + btrfs_err(info, "unrecognized discard mode value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -933,6 +944,8 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, btrfs_set_and_info(info, FREE_SPACE_TREE, "enabling free space tree"); } else { + btrfs_err(info, "unrecognized space_cache value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1014,8 +1027,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_check_integrity_print_mask: ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, + "unrecognized check_integrity_print_mask value %s", + args[0].from); goto out; + } info->check_integrity_print_mask = intarg; btrfs_info(info, "check_integrity_print_mask 0x%x", info->check_integrity_print_mask); @@ -1030,13 +1047,15 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, goto out; #endif case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) + if (strcmp(args[0].from, "panic") == 0) { btrfs_set_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else if (strcmp(args[0].from, "bug") == 0) + } else if (strcmp(args[0].from, "bug") == 0) { btrfs_clear_opt(info->mount_opt, PANIC_ON_FATAL_ERROR); - else { + } else { + btrfs_err(info, "unrecognized fatal_errors value %s", + args[0].from); ret = -EINVAL; goto out; } @@ -1044,8 +1063,12 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, case Opt_commit_interval: intarg = 0; ret = match_int(&args[0], &intarg); - if (ret) + if (ret) { + btrfs_err(info, "unrecognized commit_interval value %s", + args[0].from); + ret = -EINVAL; goto out; + } if (intarg == 0) { btrfs_info(info, "using default commit interval %us", @@ -1059,8 +1082,11 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options, break; case Opt_rescue: ret = parse_rescue_options(info, args[0].from); - if (ret < 0) + if (ret < 0) { + btrfs_err(info, "unrecognized rescue value 
%s", + args[0].from); goto out; + } break; #ifdef CONFIG_BTRFS_DEBUG case Opt_fragment_all: @@ -1903,17 +1929,12 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, old_pool_size, new_pool_size); btrfs_workqueue_set_max(fs_info->workers, new_pool_size); + btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->endio_meta_write_workers, - new_pool_size); btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size); btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size); btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size); - btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers, - new_pool_size); } static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info, @@ -1986,6 +2007,14 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) if (ret) goto restore; + /* V1 cache is not supported for subpage mount. */ + if (fs_info->sectorsize < PAGE_SIZE && btrfs_test_opt(fs_info, SPACE_CACHE)) { + btrfs_warn(fs_info, + "v1 space cache is not supported for page size %lu with sectorsize %u", + PAGE_SIZE, fs_info->sectorsize); + ret = -EINVAL; + goto restore; + } btrfs_remount_begin(fs_info, old_opts, *flags); btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, old_thread_pool_size); @@ -2214,12 +2243,8 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, if (type & BTRFS_BLOCK_GROUP_RAID0) num_stripes = nr_devices; - else if (type & BTRFS_BLOCK_GROUP_RAID1) - num_stripes = 2; - else if (type & BTRFS_BLOCK_GROUP_RAID1C3) - num_stripes = 3; - else if (type & BTRFS_BLOCK_GROUP_RAID1C4) - num_stripes = 4; + else if (type & BTRFS_BLOCK_GROUP_RAID1_MASK) + num_stripes = rattr->ncopies; else if (type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = 4; @@ -2243,17 +2268,13 @@ static inline int btrfs_calc_avail_data_space(struct btrfs_fs_info *fs_info, avail_space = rounddown(avail_space, BTRFS_STRIPE_LEN); /* - * In order to avoid overwriting the superblock on the drive, - * btrfs starts at an offset of at least 1MB when doing chunk - * allocation. - * - * This ensures we have at least min_stripe_size free space - * after excluding 1MB. + * Ensure we have at least min_stripe_size on top of the + * reserved space on the device. 
*/ - if (avail_space <= SZ_1M + min_stripe_size) + if (avail_space <= BTRFS_DEVICE_RANGE_RESERVED + min_stripe_size) continue; - avail_space -= SZ_1M; + avail_space -= BTRFS_DEVICE_RANGE_RESERVED; devices_info[i].dev = device; devices_info[i].max_avail = avail_space; @@ -2671,13 +2692,9 @@ static int __init init_btrfs_fs(void) if (err) goto free_delayed_ref; - err = btrfs_end_io_wq_init(); - if (err) - goto free_prelim_ref; - err = btrfs_interface_init(); if (err) - goto free_end_io_wq; + goto free_prelim_ref; btrfs_print_mod_info(); @@ -2693,8 +2710,6 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); -free_end_io_wq: - btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: @@ -2732,7 +2747,6 @@ static void __exit exit_btrfs_fs(void) extent_state_cache_exit(); extent_io_exit(); btrfs_interface_exit(); - btrfs_end_io_wq_exit(); unregister_filesystem(&btrfs_fs_type); btrfs_exit_sysfs(); btrfs_cleanup_fs_uuids(); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index ba78ca5aabbb..d5d0717fd09a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -21,6 +21,7 @@ #include "space-info.h" #include "block-group.h" #include "qgroup.h" +#include "misc.h" /* * Structure name Path @@ -61,6 +62,10 @@ struct raid_kobject { .store = _store, \ } +#define BTRFS_ATTR_W(_prefix, _name, _store) \ + static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ + __INIT_KOBJ_ATTR(_name, 0200, NULL, _store) + #define BTRFS_ATTR_RW(_prefix, _name, _show, _store) \ static struct kobj_attribute btrfs_attr_##_prefix##_##_name = \ __INIT_KOBJ_ATTR(_name, 0644, _show, _store) @@ -92,6 +97,7 @@ static struct btrfs_feature_attr btrfs_attr_features_##_name = { \ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); +static struct kobject *get_btrfs_kobj(struct kobject *kobj); static struct btrfs_feature_attr *to_btrfs_feature_attr(struct kobj_attribute *a) { @@ -270,12 +276,10 @@ static umode_t btrfs_feature_visible(struct kobject *kobj, return mode; } -BTRFS_FEAT_ATTR_INCOMPAT(mixed_backref, MIXED_BACKREF); BTRFS_FEAT_ATTR_INCOMPAT(default_subvol, DEFAULT_SUBVOL); BTRFS_FEAT_ATTR_INCOMPAT(mixed_groups, MIXED_GROUPS); BTRFS_FEAT_ATTR_INCOMPAT(compress_lzo, COMPRESS_LZO); BTRFS_FEAT_ATTR_INCOMPAT(compress_zstd, COMPRESS_ZSTD); -BTRFS_FEAT_ATTR_INCOMPAT(big_metadata, BIG_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); @@ -283,9 +287,10 @@ BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); BTRFS_FEAT_ATTR_INCOMPAT(metadata_uuid, METADATA_UUID); BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); BTRFS_FEAT_ATTR_INCOMPAT(raid1c34, RAID1C34); -#ifdef CONFIG_BTRFS_DEBUG -/* Remove once support for zoned allocation is feature complete */ +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_INCOMPAT(zoned, ZONED); +#endif +#ifdef CONFIG_BTRFS_DEBUG /* Remove once support for extent tree v2 is feature complete */ BTRFS_FEAT_ATTR_INCOMPAT(extent_tree_v2, EXTENT_TREE_V2); #endif @@ -296,17 +301,15 @@ BTRFS_FEAT_ATTR_COMPAT_RO(verity, VERITY); /* * Features which depend on feature bits and may differ between each fs. 
* - * /sys/fs/btrfs/features - all available features implemeted by this version + * /sys/fs/btrfs/features - all available features implemented by this version * /sys/fs/btrfs/UUID/features - features of the fs which are enabled or * can be changed on a mounted filesystem. */ static struct attribute *btrfs_supported_feature_attrs[] = { - BTRFS_FEAT_ATTR_PTR(mixed_backref), BTRFS_FEAT_ATTR_PTR(default_subvol), BTRFS_FEAT_ATTR_PTR(mixed_groups), BTRFS_FEAT_ATTR_PTR(compress_lzo), BTRFS_FEAT_ATTR_PTR(compress_zstd), - BTRFS_FEAT_ATTR_PTR(big_metadata), BTRFS_FEAT_ATTR_PTR(extended_iref), BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), @@ -314,8 +317,10 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(metadata_uuid), BTRFS_FEAT_ATTR_PTR(free_space_tree), BTRFS_FEAT_ATTR_PTR(raid1c34), -#ifdef CONFIG_BTRFS_DEBUG +#ifdef CONFIG_BLK_DEV_ZONED BTRFS_FEAT_ATTR_PTR(zoned), +#endif +#ifdef CONFIG_BTRFS_DEBUG BTRFS_FEAT_ATTR_PTR(extent_tree_v2), #endif #ifdef CONFIG_FS_VERITY @@ -394,11 +399,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* 4K sector size is also supported with 64K page size */ - if (PAGE_SIZE == SZ_64K) + /* An artificial limit to only support 4K and PAGE_SIZE */ + if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); - - /* Only sectorsize == PAGE_SIZE is now supported */ ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); return ret; @@ -711,6 +714,112 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ } \ BTRFS_ATTR(space_info, field, btrfs_space_info_show_##field) +static ssize_t btrfs_chunk_size_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_space_info *sinfo = to_space_info(kobj); + + return sysfs_emit(buf, "%llu\n", READ_ONCE(sinfo->chunk_size)); +} + +/* + * Store new chunk size in space info. Can be called on a read-only filesystem. + * + * If the new chunk size value is larger than 10% of free space it is reduced + * to match that limit. Alignment must be to 256M and the system chunk size + * cannot be set. + */ +static ssize_t btrfs_chunk_size_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + char *retptr; + u64 val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!fs_info->fs_devices) + return -EINVAL; + + if (btrfs_is_zoned(fs_info)) + return -EINVAL; + + /* System block type must not be changed. */ + if (space_info->flags & BTRFS_BLOCK_GROUP_SYSTEM) + return -EPERM; + + val = memparse(buf, &retptr); + /* There could be trailing '\n', also catch any typos after the value */ + retptr = skip_spaces(retptr); + if (*retptr != 0 || val == 0) + return -EINVAL; + + val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE); + + /* Limit stripe size to 10% of available space. */ + val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val); + + /* Must be multiple of 256M. */ + val &= ~((u64)SZ_256M - 1); + + /* Must be at least 256M. */ + if (val < SZ_256M) + return -EINVAL; + + btrfs_update_space_info_chunk_size(space_info, val); + + return len; +} + +#ifdef CONFIG_BTRFS_DEBUG +/* + * Request chunk allocation with current chunk size. 
+ */ +static ssize_t btrfs_force_chunk_alloc_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + struct btrfs_fs_info *fs_info = to_fs_info(get_btrfs_kobj(kobj)); + struct btrfs_trans_handle *trans; + bool val; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (sb_rdonly(fs_info->sb)) + return -EROFS; + + ret = kstrtobool(buf, &val); + if (ret) + return ret; + + if (!val) + return -EINVAL; + + /* + * This is unsafe to be called from sysfs context and may cause + * unexpected problems. + */ + trans = btrfs_start_transaction(fs_info->tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + ret = btrfs_force_chunk_alloc(trans, space_info->flags); + btrfs_end_transaction(trans); + + if (ret == 1) + return len; + + return -ENOSPC; +} +BTRFS_ATTR_W(space_info, force_chunk_alloc, btrfs_force_chunk_alloc_store); + +#endif + SPACE_INFO_ATTR(flags); SPACE_INFO_ATTR(total_bytes); SPACE_INFO_ATTR(bytes_used); @@ -721,6 +830,43 @@ SPACE_INFO_ATTR(bytes_readonly); SPACE_INFO_ATTR(bytes_zone_unusable); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); +BTRFS_ATTR_RW(space_info, chunk_size, btrfs_chunk_size_show, btrfs_chunk_size_store); + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + ssize_t ret; + + ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold)); + + return ret; +} + +static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_space_info *space_info = to_space_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh < 0 || thresh > 100) + return -EINVAL; + + WRITE_ONCE(space_info->bg_reclaim_threshold, thresh); + + return len; +} + +BTRFS_ATTR_RW(space_info, bg_reclaim_threshold, + btrfs_sinfo_bg_reclaim_threshold_show, + btrfs_sinfo_bg_reclaim_threshold_store); /* * Allocation information about block group types. 
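The new chunk_size attribute above is writable: btrfs_chunk_size_store() parses the value with memparse(), caps it at BTRFS_MAX_DATA_CHUNK_SIZE and at 10% of the writable device space (div_factor(total_rw_bytes, 1)), rounds it down to a 256M boundary and rejects anything that ends up below 256M, in addition to the permission, zoned and system-chunk checks shown in the hunk. The following is a minimal user-space sketch of just that clamping, useful for predicting what value actually lands in space_info->chunk_size; the 10G value assumed for BTRFS_MAX_DATA_CHUNK_SIZE and the helper name are illustrative, not taken from this patch.

#include <stdint.h>
#include <stdio.h>

#define SZ_256M			(256ULL << 20)
#define MAX_DATA_CHUNK_SIZE	(10ULL << 30)	/* assumed value of BTRFS_MAX_DATA_CHUNK_SIZE */

/* Returns the value that would be stored, or 0 where the sysfs handler rejects the input. */
static uint64_t clamp_chunk_size(uint64_t requested, uint64_t total_rw_bytes)
{
	uint64_t val = requested;

	if (val == 0)
		return 0;				/* rejected */
	if (val > MAX_DATA_CHUNK_SIZE)
		val = MAX_DATA_CHUNK_SIZE;		/* upper bound on data chunks */
	if (val > total_rw_bytes / 10)
		val = total_rw_bytes / 10;		/* "10% of available space" cap */
	val &= ~(SZ_256M - 1);				/* round down to a 256M multiple */
	if (val < SZ_256M)
		return 0;				/* too small after clamping: rejected */
	return val;
}

int main(void)
{
	/* 3G requested on a 1 TiB filesystem survives unchanged (3221225472). */
	printf("%llu\n", (unsigned long long)clamp_chunk_size(3ULL << 30, 1ULL << 40));
	/* 1G requested on a 4 GiB filesystem is cut to 10% and aligned down to 256M (268435456). */
	printf("%llu\n", (unsigned long long)clamp_chunk_size(1ULL << 30, 4ULL << 30));
	return 0;
}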
@@ -738,6 +884,11 @@ static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(space_info, bytes_zone_unusable), BTRFS_ATTR_PTR(space_info, disk_used), BTRFS_ATTR_PTR(space_info, disk_total), + BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold), + BTRFS_ATTR_PTR(space_info, chunk_size), +#ifdef CONFIG_BTRFS_DEBUG + BTRFS_ATTR_PTR(space_info, force_chunk_alloc), +#endif NULL, }; ATTRIBUTE_GROUPS(space_info); @@ -836,6 +987,48 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, BTRFS_ATTR(, sectorsize, btrfs_sectorsize_show); +static ssize_t btrfs_commit_stats_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + + return sysfs_emit(buf, + "commits %llu\n" + "last_commit_ms %llu\n" + "max_commit_ms %llu\n" + "total_commit_ms %llu\n", + fs_info->commit_stats.commit_count, + div_u64(fs_info->commit_stats.last_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.max_commit_dur, NSEC_PER_MSEC), + div_u64(fs_info->commit_stats.total_commit_dur, NSEC_PER_MSEC)); +} + +static ssize_t btrfs_commit_stats_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + unsigned long val; + int ret; + + if (!fs_info) + return -EPERM; + + if (!capable(CAP_SYS_RESOURCE)) + return -EPERM; + + ret = kstrtoul(buf, 10, &val); + if (ret) + return ret; + if (val) + return -EINVAL; + + WRITE_ONCE(fs_info->commit_stats.max_commit_dur, 0); + + return len; +} +BTRFS_ATTR_RW(, commit_stats, btrfs_commit_stats_show, btrfs_commit_stats_store); + static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { @@ -1075,6 +1268,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), BTRFS_ATTR_PTR(, bg_reclaim_threshold), + BTRFS_ATTR_PTR(, commit_stats), NULL, }; @@ -1105,6 +1299,16 @@ static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) return to_fs_devs(kobj)->fs_info; } +static struct kobject *get_btrfs_kobj(struct kobject *kobj) +{ + while (kobj) { + if (kobj->ktype == &btrfs_ktype) + return kobj; + kobj = kobj->parent; + } + return NULL; +} + #define NUM_FEATURE_BITS 64 #define BTRFS_FEATURE_NAME_MAX 13 static char btrfs_unknown_feature_names[FEAT_MAX][NUM_FEATURE_BITS][BTRFS_FEATURE_NAME_MAX]; @@ -2071,4 +2275,3 @@ void __cold btrfs_exit_sysfs(void) #endif kset_unregister(btrfs_kset); } - diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index d8e56edd6991..cc9377cf56a3 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -59,6 +59,7 @@ struct inode *btrfs_new_test_inode(void) return NULL; inode->i_mode = S_IFREG; + inode->i_ino = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; BTRFS_I(inode)->location.objectid = BTRFS_FIRST_FREE_OBJECTID; BTRFS_I(inode)->location.offset = 0; diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index 51a8b075c259..b7d181a08eab 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -47,7 +47,8 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) goto out; } - path->nodes[0] = eb = alloc_dummy_extent_buffer(fs_info, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, nodesize); + path->nodes[0] = eb; if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 
b008c5110958..0bec10740ad3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -10,6 +10,7 @@ #include <linux/pagemap.h> #include <linux/blkdev.h> #include <linux/uuid.h> +#include <linux/timekeeping.h> #include "misc.h" #include "ctree.h" #include "disk-io.h" @@ -221,7 +222,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) * the caching thread will re-start it's search from 3, and thus find * the hole from [4,6) to add to the free space cache. */ - spin_lock(&fs_info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); list_for_each_entry_safe(caching_ctl, next, &fs_info->caching_block_groups, list) { struct btrfs_block_group *cache = caching_ctl->block_group; @@ -234,7 +235,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) cache->last_byte_to_unpin = caching_ctl->progress; } } - spin_unlock(&fs_info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); up_write(&fs_info->commit_root_sem); } @@ -1831,8 +1832,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + dentry->d_name.len * 2); - parent_inode->i_mtime = parent_inode->i_ctime = - current_time(parent_inode); + parent_inode->i_mtime = current_time(parent_inode); + parent_inode->i_ctime = parent_inode->i_mtime; ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode)); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2098,12 +2099,23 @@ static void add_pending_snapshot(struct btrfs_trans_handle *trans) list_add(&trans->pending_snapshot->list, &cur_trans->pending_snapshots); } +static void update_commit_stats(struct btrfs_fs_info *fs_info, ktime_t interval) +{ + fs_info->commit_stats.commit_count++; + fs_info->commit_stats.last_commit_dur = interval; + fs_info->commit_stats.max_commit_dur = + max_t(u64, fs_info->commit_stats.max_commit_dur, interval); + fs_info->commit_stats.total_commit_dur += interval; +} + int btrfs_commit_transaction(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; int ret; + ktime_t start_time; + ktime_t interval; ASSERT(refcount_read(&trans->use_count) == 1); @@ -2228,6 +2240,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) } } + /* + * Get the time spent on the work done by the commit thread and not + * the time spent waiting on a previous commit + */ + start_time = ktime_get_ns(); + extwriter_counter_dec(cur_trans, trans->type); ret = btrfs_start_delalloc_flush(fs_info); @@ -2469,6 +2487,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) trace_btrfs_transaction_commit(fs_info); + interval = ktime_get_ns() - start_time; + btrfs_scrub_continue(fs_info); if (current->journal_info == trans) @@ -2476,6 +2496,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) kmem_cache_free(btrfs_trans_handle_cachep, trans); + update_commit_stats(fs_info, interval); + return ret; unlock_reloc: diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index e56c0107eea3..9e0e0ae2288c 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1855,3 +1855,58 @@ out: return ret; } ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO); + +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner) +{ + const bool is_subvol = is_fstree(root_owner); + const u64 eb_owner = btrfs_header_owner(eb); + 
+ /* + * Skip dummy fs, as selftests don't create unique ebs for each dummy + * root. + */ + if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state)) + return 0; + /* + * There are several call sites (backref walking, qgroup, and data + * reloc) passing 0 as @root_owner, as they are not holding the + * tree root. In that case, we can not do a reliable ownership check, + * so just exit. + */ + if (root_owner == 0) + return 0; + /* + * These trees use key.offset as their owner, our callers don't have + * the extra capacity to pass key.offset here. So we just skip them. + */ + if (root_owner == BTRFS_TREE_LOG_OBJECTID || + root_owner == BTRFS_TREE_RELOC_OBJECTID) + return 0; + + if (!is_subvol) { + /* For non-subvolume trees, the eb owner should match root owner */ + if (unlikely(root_owner != eb_owner)) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + root_owner); + return -EUCLEAN; + } + return 0; + } + + /* + * For subvolume trees, owners can mismatch, but they should all belong + * to subvolume trees. + */ + if (unlikely(is_subvol != is_fstree(eb_owner))) { + btrfs_crit(eb->fs_info, +"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]", + btrfs_header_level(eb) == 0 ? "leaf" : "node", + root_owner, btrfs_header_bytenr(eb), eb_owner, + BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID); + return -EUCLEAN; + } + return 0; +} diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 32fecc9dc1dd..ece497e26558 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node); int btrfs_check_chunk_valid(struct extent_buffer *leaf, struct btrfs_chunk *chunk, u64 logical); +int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e65633686378..dcf75a8daa20 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -171,7 +171,7 @@ again: int index = (root->log_transid + 1) % 2; if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -194,7 +194,7 @@ again: * writing. */ if (zoned && !created) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log, * pin down any logged extents, so we have to read the block. */ if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) { - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; } @@ -894,8 +894,7 @@ update_inode: btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); out: - if (inode) - iput(inode); + iput(inode); return ret; } @@ -2288,7 +2287,7 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_key location; /* - * Currenly we only log dir index keys. Even if we replay a log created + * Currently we only log dir index keys. Even if we replay a log created * by an older kernel that logged both dir index and dir item keys, all * we need to do is process the dir index keys, we (and our caller) can * safely ignore dir item keys (key type BTRFS_DIR_ITEM_KEY). 
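The btrfs_check_eb_owner() helper added in tree-checker.c above encodes a small ownership policy: callers that pass root_owner == 0 and reads of the log/reloc trees are skipped, non-subvolume trees must match the extent buffer's header owner exactly, and subvolume trees may share blocks only with other subvolume trees. Below is a stand-alone model of that accept/reject decision in user-space C; the objectid constants and the simplified is_subvol_id() stand-in for is_fstree() are assumptions for illustration, and the selftest dummy-fs skip is omitted.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FIRST_FREE_OBJECTID	256ULL		/* assumed BTRFS_FIRST_FREE_OBJECTID */
#define LAST_FREE_OBJECTID	(-256ULL)	/* assumed BTRFS_LAST_FREE_OBJECTID */
#define FS_TREE_OBJECTID	5ULL		/* assumed BTRFS_FS_TREE_OBJECTID */
#define TREE_LOG_OBJECTID	(-6ULL)		/* assumed BTRFS_TREE_LOG_OBJECTID */
#define TREE_RELOC_OBJECTID	(-8ULL)		/* assumed BTRFS_TREE_RELOC_OBJECTID */
#define EUCLEAN			117		/* "Structure needs cleaning" on Linux */

/* Rough stand-in for is_fstree(): the default fs tree plus subvolume ids. */
static bool is_subvol_id(uint64_t id)
{
	return id == FS_TREE_OBJECTID ||
	       (id >= FIRST_FREE_OBJECTID && id <= LAST_FREE_OBJECTID);
}

/* Mirrors the accept/reject structure of btrfs_check_eb_owner(). */
static int check_eb_owner(uint64_t root_owner, uint64_t eb_owner)
{
	/* Callers without the root (backref walk, qgroup, data reloc) pass 0. */
	if (root_owner == 0)
		return 0;
	/* Log and reloc trees key their owner differently, skip them too. */
	if (root_owner == TREE_LOG_OBJECTID || root_owner == TREE_RELOC_OBJECTID)
		return 0;
	/* Non-subvolume trees must match exactly. */
	if (!is_subvol_id(root_owner))
		return (eb_owner == root_owner) ? 0 : -EUCLEAN;
	/* Subvolume trees may share blocks, but only with other subvolumes. */
	return is_subvol_id(eb_owner) ? 0 : -EUCLEAN;
}

int main(void)
{
	printf("%d\n", check_eb_owner(2, 2));		/* extent tree, matching owner: 0 */
	printf("%d\n", check_eb_owner(2, 5));		/* extent tree, fs tree block: -117 */
	printf("%d\n", check_eb_owner(257, 258));	/* two subvolumes sharing a block: 0 */
	return 0;
}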
@@ -2575,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, int i; int ret; - ret = btrfs_read_buffer(eb, gen, level, NULL); + ret = btrfs_read_extent_buffer(eb, gen, level, NULL); if (ret) return ret; @@ -2786,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, path->slots[*level]++; if (wc->free) { - ret = btrfs_read_buffer(next, ptr_gen, + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); @@ -2815,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, free_extent_buffer(next); continue; } - ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key); + ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key); if (ret) { free_extent_buffer(next); return ret; @@ -3122,7 +3121,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, /* bail out if we need to do a full commit */ if (btrfs_need_log_full_commit(trans)) { - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; mutex_unlock(&root->log_mutex); goto out; } @@ -3223,7 +3222,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, } btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out; } @@ -3262,7 +3261,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, blk_finish_plug(&plug); btrfs_wait_tree_log_extents(log, mark); mutex_unlock(&log_root_tree->log_mutex); - ret = -EAGAIN; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_wake_log_root; } @@ -5849,7 +5848,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, inode_only == LOG_INODE_ALL && inode->last_unlink_trans >= trans->transid) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto out_unlock; } @@ -6563,12 +6562,12 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, bool log_dentries = false; if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } if (btrfs_root_refs(&root->root_item) == 0) { - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; goto end_no_trans; } @@ -6666,7 +6665,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); - ret = 1; + ret = BTRFS_LOG_FORCE_COMMIT; } if (ret) @@ -7030,8 +7029,15 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * anyone from syncing the log until we have updated both inodes * in the log. */ + ret = join_running_log_trans(root); + /* + * At least one of the inodes was logged before, so this should + * not fail, but if it does, it's not serious, just bail out and + * mark the log for a full commit. 
+ */ + if (WARN_ON_ONCE(ret < 0)) + goto out; log_pinned = true; - btrfs_pin_log_trans(root); path = btrfs_alloc_path(); if (!path) { diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 1620f8170629..57ab5f3b8dc7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -12,6 +12,9 @@ /* return value for btrfs_log_dentry_safe that means we don't need to log it at all */ #define BTRFS_NO_LOG_SYNC 256 +/* We can't use the tree log for whatever reason, force a transaction commit */ +#define BTRFS_LOG_FORCE_COMMIT (1) + struct btrfs_log_ctx { int log_ret; int log_transid; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b6b00338037c..272901514b0c 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { */ enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags) { - if (flags & BTRFS_BLOCK_GROUP_RAID10) - return BTRFS_RAID_RAID10; - else if (flags & BTRFS_BLOCK_GROUP_RAID1) - return BTRFS_RAID_RAID1; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C3) - return BTRFS_RAID_RAID1C3; - else if (flags & BTRFS_BLOCK_GROUP_RAID1C4) - return BTRFS_RAID_RAID1C4; - else if (flags & BTRFS_BLOCK_GROUP_DUP) - return BTRFS_RAID_DUP; - else if (flags & BTRFS_BLOCK_GROUP_RAID0) - return BTRFS_RAID_RAID0; - else if (flags & BTRFS_BLOCK_GROUP_RAID5) - return BTRFS_RAID_RAID5; - else if (flags & BTRFS_BLOCK_GROUP_RAID6) - return BTRFS_RAID_RAID6; - - return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */ + const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK); + + if (!profile) + return BTRFS_RAID_SINGLE; + + return BTRFS_BG_FLAG_TO_INDEX(profile); } const char *btrfs_bg_type_to_raid_name(u64 flags) @@ -194,6 +182,13 @@ const char *btrfs_bg_type_to_raid_name(u64 flags) return btrfs_raid_array[index].raid_name; } +int btrfs_nr_parity_stripes(u64 type) +{ + enum btrfs_raid_types index = btrfs_bg_flags_to_raid_index(type); + + return btrfs_raid_array[index].nparity; +} + /* * Fill @buf with textual description of @bg_flags, no more than @size_buf * bytes including terminating null byte. @@ -250,7 +245,6 @@ out_overflow:; static int init_first_rw_device(struct btrfs_trans_handle *trans); static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info); -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); static int __btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, @@ -1408,12 +1402,7 @@ static u64 dev_extent_search_start(struct btrfs_device *device, u64 start) { switch (device->fs_devices->chunk_alloc_policy) { case BTRFS_CHUNK_ALLOC_REGULAR: - /* - * We don't want to overwrite the superblock on the drive nor - * any area used by the boot loader (grub for example), so we - * make sure to start at an offset of at least 1MB. 
- */ - return max_t(u64, start, SZ_1M); + return max_t(u64, start, BTRFS_DEVICE_RANGE_RESERVED); case BTRFS_CHUNK_ALLOC_ZONED: /* * We don't care about the starting region like regular @@ -4062,13 +4051,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info, if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) return true; - if (fs_info->sectorsize < PAGE_SIZE && - bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) { - btrfs_err(fs_info, - "RAID56 is not yet supported for sectorsize %u with page size %lu", - fs_info->sectorsize, PAGE_SIZE); - return false; - } /* Profile is valid and does not have bits outside of the allowed set */ if (alloc_profile_is_valid(bargs->target, 1) && (bargs->target & ~allowed) == 0) @@ -5090,26 +5072,16 @@ static void init_alloc_chunk_ctl_policy_regular( struct btrfs_fs_devices *fs_devices, struct alloc_chunk_ctl *ctl) { - u64 type = ctl->type; + struct btrfs_space_info *space_info; - if (type & BTRFS_BLOCK_GROUP_DATA) { - ctl->max_stripe_size = SZ_1G; - ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE; - } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - /* For larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) - ctl->max_stripe_size = SZ_1G; - else - ctl->max_stripe_size = SZ_256M; - ctl->max_chunk_size = ctl->max_stripe_size; - } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - ctl->max_stripe_size = SZ_32M; - ctl->max_chunk_size = 2 * ctl->max_stripe_size; - ctl->devs_max = min_t(int, ctl->devs_max, - BTRFS_MAX_DEVS_SYS_CHUNK); - } else { - BUG(); - } + space_info = btrfs_find_space_info(fs_devices->fs_info, ctl->type); + ASSERT(space_info); + + ctl->max_chunk_size = READ_ONCE(space_info->chunk_size); + ctl->max_stripe_size = ctl->max_chunk_size; + + if (ctl->type & BTRFS_BLOCK_GROUP_SYSTEM) + ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); /* We don't want a chunk larger than 10% of writable space */ ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), @@ -5739,7 +5711,8 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { struct extent_map *em; struct map_lookup *map; - int ret; + enum btrfs_raid_types index; + int ret = 1; em = btrfs_get_chunk_map(fs_info, logical, len); if (IS_ERR(em)) @@ -5752,10 +5725,11 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; map = em->map_lookup; - if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK)) - ret = map->num_stripes; - else if (map->type & BTRFS_BLOCK_GROUP_RAID10) - ret = map->sub_stripes; + index = btrfs_bg_flags_to_raid_index(map->type); + + /* Non-RAID56, use their ncopies from btrfs_raid_array. */ + if (!(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)) + ret = btrfs_raid_array[index].ncopies; else if (map->type & BTRFS_BLOCK_GROUP_RAID5) ret = 2; else if (map->type & BTRFS_BLOCK_GROUP_RAID6) @@ -5767,8 +5741,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. 
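/*
 * Hedged sketch (user-space C, not kernel code): btrfs_num_copies() above now
 * comes from a table lookup instead of an if/else ladder. The tiny table below
 * mirrors the idea; entries and demo_* names are illustrative, the real values
 * live in btrfs_raid_array in the kernel.
 */
#include <stdio.h>

enum demo_raid { DEMO_SINGLE, DEMO_DUP, DEMO_RAID1, DEMO_RAID10, DEMO_RAID5, DEMO_RAID6, DEMO_NR };

static const struct { int ncopies; int nparity; } demo_raid_attr[DEMO_NR] = {
	[DEMO_SINGLE] = { 1, 0 },
	[DEMO_DUP]    = { 2, 0 },
	[DEMO_RAID1]  = { 2, 0 },
	[DEMO_RAID10] = { 2, 0 },
	[DEMO_RAID5]  = { 1, 1 },   /* data stored once, plus one parity stripe */
	[DEMO_RAID6]  = { 1, 2 },   /* data stored once, plus two parity stripes */
};

static int demo_num_copies(enum demo_raid idx)
{
	/* parity profiles can rebuild from nparity missing devices, so reads get 1 + nparity chances */
	if (demo_raid_attr[idx].nparity)
		return 1 + demo_raid_attr[idx].nparity;
	return demo_raid_attr[idx].ncopies;
}

int main(void)
{
	printf("raid6 read copies: %d\n", demo_num_copies(DEMO_RAID6));  /* 3 */
	printf("raid1 read copies: %d\n", demo_num_copies(DEMO_RAID1));  /* 2 */
	return 0;
}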
*/ ret = map->num_stripes; - else - ret = 1; free_extent_map(em); down_read(&fs_info->dev_replace.rwsem); @@ -5787,6 +5759,9 @@ unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, struct map_lookup *map; unsigned long len = fs_info->sectorsize; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return len; + em = btrfs_get_chunk_map(fs_info, logical, len); if (!WARN_ON(IS_ERR(em))) { @@ -5804,6 +5779,9 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) struct map_lookup *map; int ret = 0; + if (!btrfs_fs_incompat(fs_info, RAID56)) + return 0; + em = btrfs_get_chunk_map(fs_info, logical, len); if(!WARN_ON(IS_ERR(em))) { @@ -5936,18 +5914,17 @@ void btrfs_put_bioc(struct btrfs_io_context *bioc) kfree(bioc); } -/* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */ /* * Please note that, discard won't be sent to target device of device * replace. */ -static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, - u64 logical, u64 *length_ret, - struct btrfs_io_context **bioc_ret) +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes) { struct extent_map *em; struct map_lookup *map; - struct btrfs_io_context *bioc; + struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; u64 stripe_nr; @@ -5956,29 +5933,26 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, u64 stripe_cnt; u64 stripe_len; u64 stripe_offset; - u64 num_stripes; u32 stripe_index; u32 factor = 0; u32 sub_stripes = 0; u64 stripes_per_dev = 0; u32 remaining_stripes = 0; u32 last_stripe = 0; - int ret = 0; + int ret; int i; - /* Discard always returns a bioc. */ - ASSERT(bioc_ret); - em = btrfs_get_chunk_map(fs_info, logical, length); if (IS_ERR(em)) - return PTR_ERR(em); + return ERR_CAST(em); map = em->map_lookup; + /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { ret = -EOPNOTSUPP; - goto out; - } + goto out_free_map; +} offset = logical - em->start; length = min_t(u64, em->start + em->len - logical, length); @@ -6004,7 +5978,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - num_stripes = 1; + *num_stripes = 1; stripe_index = 0; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { @@ -6014,7 +5988,7 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, sub_stripes = map->sub_stripes; factor = map->num_stripes / sub_stripes; - num_stripes = min_t(u64, map->num_stripes, + *num_stripes = min_t(u64, map->num_stripes, sub_stripes * stripe_cnt); stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index); stripe_index *= sub_stripes; @@ -6024,31 +5998,30 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, last_stripe *= sub_stripes; } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_DUP)) { - num_stripes = map->num_stripes; + *num_stripes = map->num_stripes; } else { stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &stripe_index); } - bioc = alloc_btrfs_io_context(fs_info, num_stripes, 0); - if (!bioc) { + stripes = kcalloc(*num_stripes, sizeof(*stripes), GFP_NOFS); + if (!stripes) { ret = -ENOMEM; - goto out; + goto out_free_map; } - for (i = 0; i < num_stripes; i++) { - bioc->stripes[i].physical = + for (i = 0; i < *num_stripes; i++) { + stripes[i].physical = map->stripes[stripe_index].physical + 
stripe_offset + stripe_nr * map->stripe_len; - bioc->stripes[i].dev = map->stripes[stripe_index].dev; + stripes[i].dev = map->stripes[stripe_index].dev; if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) { - bioc->stripes[i].length = stripes_per_dev * - map->stripe_len; + stripes[i].length = stripes_per_dev * map->stripe_len; if (i / sub_stripes < remaining_stripes) - bioc->stripes[i].length += map->stripe_len; + stripes[i].length += map->stripe_len; /* * Special for the first stripe and @@ -6059,17 +6032,17 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, * off end_off */ if (i < sub_stripes) - bioc->stripes[i].length -= stripe_offset; + stripes[i].length -= stripe_offset; if (stripe_index >= last_stripe && stripe_index <= (last_stripe + sub_stripes - 1)) - bioc->stripes[i].length -= stripe_end_offset; + stripes[i].length -= stripe_end_offset; if (i == sub_stripes - 1) stripe_offset = 0; } else { - bioc->stripes[i].length = length; + stripes[i].length = length; } stripe_index++; @@ -6079,12 +6052,11 @@ static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info, } } - *bioc_ret = bioc; - bioc->map_type = map->type; - bioc->num_stripes = num_stripes; -out: free_extent_map(em); - return ret; + return stripes; +out_free_map: + free_extent_map(em); + return ERR_PTR(ret); } /* @@ -6227,7 +6199,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + i; new->physical = old->physical; - new->length = old->length; new->dev = dev_replace->tgtdev; bioc->tgtdev_map[i] = index_where_to_add; index_where_to_add++; @@ -6268,8 +6239,6 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->stripes + num_stripes; tgtdev_stripe->physical = physical_of_found; - tgtdev_stripe->length = - bioc->stripes[index_srcdev].length; tgtdev_stripe->dev = dev_replace->tgtdev; bioc->tgtdev_map[index_srcdev] = num_stripes; @@ -6312,7 +6281,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, u64 offset; u64 stripe_offset; u64 stripe_nr; - u64 stripe_len; + u32 stripe_len; u64 raid56_full_stripe_start = (u64)-1; int data_stripes; @@ -6323,19 +6292,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em, offset = logical - em->start; /* Len of a stripe in a chunk */ stripe_len = map->stripe_len; - /* Stripe where this block falls in */ - stripe_nr = div64_u64(offset, stripe_len); - /* Offset of stripe in the chunk */ - stripe_offset = stripe_nr * stripe_len; - if (offset < stripe_offset) { - btrfs_crit(fs_info, -"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu", - stripe_offset, offset, em->start, logical, stripe_len); - return -EINVAL; - } + /* + * Stripe_nr is where this block falls in + * stripe_offset is the offset of this block in its stripe. + */ + stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset); + ASSERT(stripe_offset < U32_MAX); - /* stripe_offset is the offset of this block in its stripe */ - stripe_offset = offset - stripe_offset; data_stripes = nr_data_stripes(map); /* Only stripe based profiles needs to check against stripe length. 
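/*
 * Hedged sketch (plain C, not kernel code): the io-geometry hunk above now
 * derives the stripe number and the offset inside that stripe from a single
 * division with remainder (div64_u64_rem) instead of divide, multiply and
 * subtract. In ordinary user-space C the same split looks like this:
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	const uint32_t stripe_len = 64 * 1024;      /* BTRFS_STRIPE_LEN is 64K */
	uint64_t offset = 3 * 64 * 1024 + 4096;     /* logical offset inside the chunk */

	uint64_t stripe_nr = offset / stripe_len;                  /* which stripe the block falls in */
	uint32_t stripe_offset = (uint32_t)(offset % stripe_len);  /* offset of the block in that stripe */

	printf("stripe_nr=%" PRIu64 " stripe_offset=%u\n", stripe_nr, stripe_offset);
	/* prints: stripe_nr=3 stripe_offset=4096 */
	return 0;
}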
*/ @@ -6497,6 +6460,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, } } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ASSERT(map->stripe_len == BTRFS_STRIPE_LEN); if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) { /* push stripe_nr back to the start of the full stripe */ stripe_nr = div64_u64(raid56_full_stripe_start, @@ -6504,9 +6468,12 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, /* RAID[56] write or recovery. Return all stripes */ num_stripes = map->num_stripes; - max_errors = nr_parity_stripes(map); + max_errors = btrfs_chunk_max_errors(map); - *length = map->stripe_len; + /* Return the length to the full stripe end */ + *length = min(logical + *length, + raid56_full_stripe_start + em->start + + data_stripes * stripe_len) - logical; stripe_index = 0; stripe_offset = 0; } else { @@ -6629,10 +6596,6 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret, int mirror_num) { - if (op == BTRFS_MAP_DISCARD) - return __btrfs_map_block_for_discard(fs_info, logical, - length, bioc_ret); - return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, mirror_num, 0); } @@ -6645,77 +6608,106 @@ int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, return __btrfs_map_block(fs_info, op, logical, length, bioc_ret, 0, 1); } -static inline void btrfs_end_bioc(struct btrfs_io_context *bioc, struct bio *bio) +static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_io_context *bioc) +{ + if (bioc->orig_bio->bi_opf & REQ_META) + return bioc->fs_info->endio_meta_workers; + return bioc->fs_info->endio_workers; +} + +static void btrfs_end_bio_work(struct work_struct *work) { - bio->bi_private = bioc->private; - bio->bi_end_io = bioc->end_io; - bio_endio(bio); + struct btrfs_bio *bbio = + container_of(work, struct btrfs_bio, end_io_work); + + bio_endio(&bbio->bio); +} + +static void btrfs_end_bioc(struct btrfs_io_context *bioc, bool async) +{ + struct bio *orig_bio = bioc->orig_bio; + struct btrfs_bio *bbio = btrfs_bio(orig_bio); + + bbio->mirror_num = bioc->mirror_num; + orig_bio->bi_private = bioc->private; + orig_bio->bi_end_io = bioc->end_io; + + /* + * Only send an error to the higher layers if it is beyond the tolerance + * threshold. 
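/*
 * Hedged sketch (user-space stdatomic, not kernel code): the endio rework in
 * this area counts per-stripe failures and only fails the original bio once
 * the error count exceeds the profile's tolerance; the stripe whose completion
 * drops the pending counter to zero finishes the whole I/O context. The
 * demo_* structure below only models that counting pattern.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct demo_ioc {
	atomic_int stripes_pending;
	atomic_int error;
	int max_errors;     /* how many stripe failures the profile tolerates */
};

static void demo_stripe_done(struct demo_ioc *ioc, bool failed)
{
	if (failed)
		atomic_fetch_add(&ioc->error, 1);

	/* only the caller that drops the counter to zero completes the request */
	if (atomic_fetch_sub(&ioc->stripes_pending, 1) == 1) {
		bool ok = atomic_load(&ioc->error) <= ioc->max_errors;
		printf("request completes: %s\n", ok ? "OK" : "IO error");
	}
}

int main(void)
{
	struct demo_ioc ioc = { .stripes_pending = 2, .error = 0, .max_errors = 1 };

	demo_stripe_done(&ioc, true);   /* one mirror failed ...            */
	demo_stripe_done(&ioc, false);  /* ... the other succeeded: still OK */
	return 0;
}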
+ */ + if (atomic_read(&bioc->error) > bioc->max_errors) + orig_bio->bi_status = BLK_STS_IOERR; + else + orig_bio->bi_status = BLK_STS_OK; + + if (btrfs_op(orig_bio) == BTRFS_MAP_READ && async) { + INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work); + queue_work(btrfs_end_io_wq(bioc), &bbio->end_io_work); + } else { + bio_endio(orig_bio); + } btrfs_put_bioc(bioc); } static void btrfs_end_bio(struct bio *bio) { - struct btrfs_io_context *bioc = bio->bi_private; - int is_orig_bio = 0; + struct btrfs_io_stripe *stripe = bio->bi_private; + struct btrfs_io_context *bioc = stripe->bioc; if (bio->bi_status) { atomic_inc(&bioc->error); if (bio->bi_status == BLK_STS_IOERR || bio->bi_status == BLK_STS_TARGET) { - struct btrfs_device *dev = btrfs_bio(bio)->device; - - ASSERT(dev->bdev); if (btrfs_op(bio) == BTRFS_MAP_WRITE) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_WRITE_ERRS); else if (!(bio->bi_opf & REQ_RAHEAD)) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); if (bio->bi_opf & REQ_PREFLUSH) - btrfs_dev_stat_inc_and_print(dev, + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_FLUSH_ERRS); } } - if (bio == bioc->orig_bio) - is_orig_bio = 1; + if (bio != bioc->orig_bio) + bio_put(bio); btrfs_bio_counter_dec(bioc->fs_info); - - if (atomic_dec_and_test(&bioc->stripes_pending)) { - if (!is_orig_bio) { - bio_put(bio); - bio = bioc->orig_bio; - } - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - /* only send an error to the higher layers if it is - * beyond the tolerance of the btrfs bio - */ - if (atomic_read(&bioc->error) > bioc->max_errors) { - bio->bi_status = BLK_STS_IOERR; - } else { - /* - * this bio is actually up to date, we didn't - * go over the max number of errors - */ - bio->bi_status = BLK_STS_OK; - } - - btrfs_end_bioc(bioc, bio); - } else if (!is_orig_bio) { - bio_put(bio); - } + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, true); } -static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, - u64 physical, struct btrfs_device *dev) +static void submit_stripe_bio(struct btrfs_io_context *bioc, + struct bio *orig_bio, int dev_nr, bool clone) { struct btrfs_fs_info *fs_info = bioc->fs_info; + struct btrfs_device *dev = bioc->stripes[dev_nr].dev; + u64 physical = bioc->stripes[dev_nr].physical; + struct bio *bio; + + if (!dev || !dev->bdev || + test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) || + (btrfs_op(orig_bio) == BTRFS_MAP_WRITE && + !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { + atomic_inc(&bioc->error); + if (atomic_dec_and_test(&bioc->stripes_pending)) + btrfs_end_bioc(bioc, false); + return; + } + + if (clone) { + bio = bio_alloc_clone(dev->bdev, orig_bio, GFP_NOFS, &fs_bio_set); + } else { + bio = orig_bio; + bio_set_dev(bio, dev->bdev); + btrfs_bio(bio)->device = dev; + } - bio->bi_private = bioc; - btrfs_bio(bio)->device = dev; + bioc->stripes[dev_nr].bioc = bioc; + bio->bi_private = &bioc->stripes[dev_nr]; bio->bi_end_io = btrfs_end_bio; bio->bi_iter.bi_sector = physical >> 9; /* @@ -6733,77 +6725,50 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio, } } btrfs_debug_in_rcu(fs_info, - "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", - bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, + "%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u", + __func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector, (unsigned long)dev->bdev->bd_dev, 
rcu_str_deref(dev->name), dev->devid, bio->bi_iter.bi_size); - bio_set_dev(bio, dev->bdev); btrfs_bio_counter_inc_noblocked(fs_info); - btrfsic_submit_bio(bio); + btrfsic_check_bio(bio); + submit_bio(bio); } -static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical) +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num) { - atomic_inc(&bioc->error); - if (atomic_dec_and_test(&bioc->stripes_pending)) { - /* Should be the original bio. */ - WARN_ON(bio != bioc->orig_bio); - - btrfs_bio(bio)->mirror_num = bioc->mirror_num; - bio->bi_iter.bi_sector = logical >> 9; - if (atomic_read(&bioc->error) > bioc->max_errors) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_status = BLK_STS_OK; - btrfs_end_bioc(bioc, bio); - } -} - -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num) -{ - struct btrfs_device *dev; - struct bio *first_bio = bio; u64 logical = bio->bi_iter.bi_sector << 9; - u64 length = 0; - u64 map_length; + u64 length = bio->bi_iter.bi_size; + u64 map_length = length; int ret; int dev_nr; int total_devs; struct btrfs_io_context *bioc = NULL; - length = bio->bi_iter.bi_size; - map_length = length; - btrfs_bio_counter_inc_blocked(fs_info); ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, &bioc, mirror_num, 1); if (ret) { btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + bio->bi_status = errno_to_blk_status(ret); + bio_endio(bio); + return; } total_devs = bioc->num_stripes; - bioc->orig_bio = first_bio; - bioc->private = first_bio->bi_private; - bioc->end_io = first_bio->bi_end_io; - atomic_set(&bioc->stripes_pending, bioc->num_stripes); + bioc->orig_bio = bio; + bioc->private = bio->bi_private; + bioc->end_io = bio->bi_end_io; + atomic_set(&bioc->stripes_pending, total_devs); if ((bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && ((btrfs_op(bio) == BTRFS_MAP_WRITE) || (mirror_num > 1))) { - /* In this case, map_length has been set to the length of - a single stripe; not the whole write */ - if (btrfs_op(bio) == BTRFS_MAP_WRITE) { - ret = raid56_parity_write(bio, bioc, map_length); - } else { - ret = raid56_parity_recover(bio, bioc, map_length, - mirror_num, 1); - } - - btrfs_bio_counter_dec(fs_info); - return errno_to_blk_status(ret); + if (btrfs_op(bio) == BTRFS_MAP_WRITE) + raid56_parity_write(bio, bioc); + else + raid56_parity_recover(bio, bioc, mirror_num, true); + return; } if (map_length < length) { @@ -6814,24 +6779,11 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, } for (dev_nr = 0; dev_nr < total_devs; dev_nr++) { - dev = bioc->stripes[dev_nr].dev; - if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING, - &dev->dev_state) || - (btrfs_op(first_bio) == BTRFS_MAP_WRITE && - !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) { - bioc_error(bioc, first_bio, logical); - continue; - } - - if (dev_nr < total_devs - 1) - bio = btrfs_bio_clone(first_bio); - else - bio = first_bio; + const bool should_clone = (dev_nr < total_devs - 1); - submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev); + submit_stripe_bio(bioc, bio, dev_nr, should_clone); } btrfs_bio_counter_dec(fs_info); - return BLK_STS_OK; } static bool dev_args_match_fs_devices(const struct btrfs_dev_lookup_args *args, @@ -6989,11 +6941,12 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) +u64 btrfs_calc_stripe_length(const 
struct extent_map *em) { - const int data_stripes = calc_data_stripes(type, num_stripes); + const struct map_lookup *map = em->map_lookup; + const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(chunk_len, data_stripes); + return div_u64(em->len, data_stripes); } #if BITS_PER_LONG == 32 @@ -7132,8 +7085,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(type, em->len, - map->num_stripes); + em->orig_block_len = btrfs_calc_stripe_length(em); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -7259,7 +7211,8 @@ static int read_one_dev(struct extent_buffer *leaf, u8 fs_uuid[BTRFS_FSID_SIZE]; u8 dev_uuid[BTRFS_UUID_SIZE]; - devid = args.devid = btrfs_device_id(leaf, dev_item); + devid = btrfs_device_id(leaf, dev_item); + args.devid = devid; read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item), BTRFS_UUID_SIZE); read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item), @@ -7359,7 +7312,6 @@ static int read_one_dev(struct extent_buffer *leaf, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) { - struct btrfs_root *root = fs_info->tree_root; struct btrfs_super_block *super_copy = fs_info->super_copy; struct extent_buffer *sb; struct btrfs_disk_key *disk_key; @@ -7375,30 +7327,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info) struct btrfs_key key; ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize); + /* - * This will create extent buffer of nodesize, superblock size is - * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will - * overallocate but we can keep it as-is, only the first page is used. + * We allocated a dummy extent, just to use extent buffer accessors. + * There will be unused space after BTRFS_SUPER_INFO_SIZE, but + * that's fine, we will not go beyond system chunk array anyway. */ - sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET, - root->root_key.objectid, 0); - if (IS_ERR(sb)) - return PTR_ERR(sb); + sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET); + if (!sb) + return -ENOMEM; set_extent_buffer_uptodate(sb); - /* - * The sb extent buffer is artificial and just used to read the system array. - * set_extent_buffer_uptodate() call does not properly mark all it's - * pages up-to-date when the page is larger: extent does not cover the - * whole page and consequently check_page_uptodate does not find all - * the page's extents up-to-date (the hole beyond sb), - * write_extent_buffer then triggers a WARN_ON. - * - * Regular short extents go through mark_extent_buffer_dirty/writeback cycle, - * but sb spans only this function. Add an explicit SetPageUptodate call - * to silence the warning eg. on PowerPC 64. 
- */ - if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE) - SetPageUptodate(sb->pages[0]); write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); @@ -7561,6 +7499,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) struct btrfs_key found_key; int ret; int slot; + int iter_ret = 0; u64 total_dev = 0; u64 last_ra_node = 0; @@ -7604,30 +7543,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.offset = 0; key.type = 0; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto error; - while (1) { - struct extent_buffer *node; + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { + struct extent_buffer *node = path->nodes[1]; leaf = path->nodes[0]; slot = path->slots[0]; - if (slot >= btrfs_header_nritems(leaf)) { - ret = btrfs_next_leaf(root, path); - if (ret == 0) - continue; - if (ret < 0) - goto error; - break; - } - node = path->nodes[1]; + if (node) { if (last_ra_node != node->start) { readahead_tree_node_children(node); last_ra_node = node->start; } } - btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.type == BTRFS_DEV_ITEM_KEY) { struct btrfs_dev_item *dev_item; dev_item = btrfs_item_ptr(leaf, slot, @@ -7652,7 +7579,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) if (ret) goto error; } - path->slots[0]++; + } + /* Catch error found during iteration */ + if (iter_ret < 0) { + ret = iter_ret; + goto error; } /* @@ -7660,12 +7591,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * do another round of validation checks. */ if (total_dev != fs_info->fs_devices->total_devices) { - btrfs_err(fs_info, - "super_num_devices %llu mismatch with num_devices %llu found here", + btrfs_warn(fs_info, +"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit", btrfs_super_num_devices(fs_info->super_copy), total_dev); - ret = -EINVAL; - goto error; + fs_info->fs_devices->total_devices = total_dev; + btrfs_set_super_num_devices(fs_info->super_copy, total_dev); } if (btrfs_super_total_bytes(fs_info->super_copy) < fs_info->fs_devices->total_rw_bytes) { @@ -7910,11 +7841,7 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans) void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index) { btrfs_dev_stat_inc(dev, index); - btrfs_dev_stat_print_on_error(dev); -} -static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev) -{ if (!dev->dev_stats_valid) return; btrfs_err_rl_in_rcu(dev->fs_info, @@ -8056,7 +7983,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } map = em->map_lookup; - stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes); + stripe_len = btrfs_calc_stripe_length(em); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", @@ -8066,6 +7993,16 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } + /* + * Very old mkfs.btrfs (before v4.1) will not respect the reserved + * space. Although kernel can handle it without problem, better to warn + * the users. 
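/*
 * Hedged sketch (plain C, not kernel code): the chunk-tree and xattr hunks in
 * this diff convert open-coded search / next-leaf / slot++ loops into the
 * btrfs_for_each_slot() iterator, with an error value inspected after the
 * loop. The shape of such a for-each macro, on a dummy in-memory "tree", is
 * roughly the following; all demo_* names are invented for the sketch.
 */
#include <stdio.h>

struct demo_iter { const int *items; int nr; int slot; };

static int demo_first(struct demo_iter *it, const int *items, int nr)
{
	it->items = items;
	it->nr = nr;
	it->slot = 0;
	return it->nr ? 0 : 1;      /* 0: positioned on an item, >0: nothing to iterate */
}

static int demo_next(struct demo_iter *it)
{
	it->slot++;
	return it->slot < it->nr ? 0 : 1;
}

/*
 * ret is 0 while the body runs for the current slot and ends up > 0 once the
 * items are exhausted; the kernel iterator can also leave a negative errno.
 */
#define demo_for_each(it, items, nr, ret)               \
	for ((ret) = demo_first((it), (items), (nr));   \
	     (ret) == 0;                                \
	     (ret) = demo_next((it)))

int main(void)
{
	static const int keys[] = { 3, 5, 8 };
	struct demo_iter it;
	int ret;

	demo_for_each(&it, keys, 3, ret)
		printf("slot %d -> key %d\n", it.slot, it.items[it.slot]);

	if (ret < 0)
		printf("iteration failed: %d\n", ret);
	return 0;
}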
+ */ + if (physical_offset < BTRFS_DEVICE_RANGE_RESERVED) + btrfs_warn(fs_info, + "devid %llu physical %llu len %llu inside the reserved space", + devid, physical_offset, physical_len); + for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->devid == devid && map->stripes[i].physical == physical_offset) { @@ -8277,7 +8214,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr) static int relocating_repair_kthread(void *data) { - struct btrfs_block_group *cache = (struct btrfs_block_group *)data; + struct btrfs_block_group *cache = data; struct btrfs_fs_info *fs_info = cache->fs_info; u64 target; int ret = 0; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index b11c563d2025..5639961b3626 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -17,17 +17,51 @@ extern struct mutex uuid_mutex; #define BTRFS_STRIPE_LEN SZ_64K +/* Used by sanity check for btrfs_raid_types. */ +#define const_ffs(n) (__builtin_ctzll(n) + 1) + +/* + * The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires + * RAID0 always to be the lowest profile bit. + * Although it's part of on-disk format and should never change, do extra + * compile-time sanity checks. + */ +static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) < + const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0)); +static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) > + ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK)); + +/* ilog2() can handle both constants and variables */ +#define BTRFS_BG_FLAG_TO_INDEX(profile) \ + ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1)) + +enum btrfs_raid_types { + /* SINGLE is the special one as it doesn't have on-disk bit. */ + BTRFS_RAID_SINGLE = 0, + + BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0), + BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1), + BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP), + BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10), + BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5), + BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6), + BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3), + BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4), + + BTRFS_NR_RAID_TYPES +}; + struct btrfs_io_geometry { /* remaining bytes before crossing a stripe */ u64 len; /* offset of logical address in chunk */ u64 offset; /* length of single IO stripe */ - u64 stripe_len; + u32 stripe_len; + /* offset of address in stripe */ + u32 stripe_offset; /* number of stripe where address falls */ u64 stripe_nr; - /* offset of address in stripe */ - u64 stripe_offset; /* offset of raid56 stripe into the chunk */ u64 raid56_stripe_offset; }; @@ -321,6 +355,13 @@ struct btrfs_fs_devices { / sizeof(struct btrfs_stripe) + 1) /* + * Maximum number of sectors for a single bio to limit the size of the + * checksum array. This matches the number of bio_vecs per bio and thus the + * I/O size for buffered I/O. + */ +#define BTRFS_MAX_BIO_SECTORS (256) + +/* * Additional info to pass along bio. * * Mostly for btrfs specific features like csum and mirror_num. @@ -337,6 +378,9 @@ struct btrfs_bio { u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; struct bvec_iter iter; + /* For read end I/O handling */ + struct work_struct end_io_work; + /* * This member must come last, bio_alloc_bioset will allocate enough * bytes for entire btrfs_bio but relies on bio being last. 
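/*
 * Hedged sketch (plain C with GCC/clang builtins, not kernel code): the new
 * BTRFS_BG_FLAG_TO_INDEX() relies on the profile bits being consecutive with
 * RAID0 as the lowest one, so "bit position relative to RAID0" is a dense
 * array index and index 0 stays free for SINGLE. The demo bit values below
 * follow the on-disk flag layout, but the demo_* names and the ctz spelling
 * of the same arithmetic are only for this sketch.
 */
#include <stdio.h>

#define DEMO_BG_RAID0   (1ULL << 3)
#define DEMO_BG_RAID1   (1ULL << 4)
#define DEMO_BG_DUP     (1ULL << 5)
#define DEMO_BG_RAID10  (1ULL << 6)

/* index 0 is reserved for SINGLE, which has no flag bit at all */
#define DEMO_FLAG_TO_INDEX(profile) \
	(__builtin_ctzll(profile) - __builtin_ctzll(DEMO_BG_RAID0) + 1)

_Static_assert(DEMO_FLAG_TO_INDEX(DEMO_BG_RAID0) == 1, "RAID0 must map to index 1");
_Static_assert(DEMO_FLAG_TO_INDEX(DEMO_BG_RAID10) == 4, "RAID10 must map to index 4");

int main(void)
{
	printf("raid1 index: %d\n", DEMO_FLAG_TO_INDEX(DEMO_BG_RAID1)); /* 2 */
	printf("dup   index: %d\n", DEMO_FLAG_TO_INDEX(DEMO_BG_DUP));   /* 3 */
	return 0;
}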
@@ -357,10 +401,36 @@ static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio) } } +/* + * Iterate through a btrfs_bio (@bbio) on a per-sector basis. + * + * bvl - struct bio_vec + * bbio - struct btrfs_bio + * iters - struct bvec_iter + * bio_offset - unsigned int + */ +#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \ + for ((iter) = (bbio)->iter, (bio_offset) = 0; \ + (iter).bi_size && \ + (((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \ + (bio_offset) += fs_info->sectorsize, \ + bio_advance_iter_single(&(bbio)->bio, &(iter), \ + (fs_info)->sectorsize)) + struct btrfs_io_stripe { struct btrfs_device *dev; + union { + /* Block mapping */ + u64 physical; + /* For the endio handler */ + struct btrfs_io_context *bioc; + }; +}; + +struct btrfs_discard_stripe { + struct btrfs_device *dev; u64 physical; - u64 length; /* only used for discard mappings */ + u64 length; }; /* @@ -430,7 +500,7 @@ struct map_lookup { u64 type; int io_align; int io_width; - u64 stripe_len; + u32 stripe_len; int num_stripes; int sub_stripes; int verified_stripes; /* For mount time dev extent verification */ @@ -499,6 +569,9 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, u64 logical, u64 *length, struct btrfs_io_context **bioc_ret); +struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, + u64 logical, u64 *length_ret, + u32 *num_stripes); int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *map, enum btrfs_map_op op, u64 logical, struct btrfs_io_geometry *io_geom); @@ -507,8 +580,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); void btrfs_mapping_tree_free(struct extent_map_tree *tree); -blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio, - int mirror_num); +void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, fmode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, @@ -567,6 +639,8 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); +u64 btrfs_calc_stripe_length(const struct extent_map *em); +int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 85691dc2232f..7421abcf325a 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -272,10 +272,12 @@ out: ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) { + struct btrfs_key found_key; struct btrfs_key key; struct inode *inode = d_inode(dentry); struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path; + int iter_ret = 0; int ret = 0; size_t total_size = 0, size_left = size; @@ -294,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path->reada = READA_FORWARD; /* search for our xattrs */ - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) - goto err; - - while (1) { + btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *leaf; int slot; struct btrfs_dir_item *di; - struct btrfs_key 
found_key; u32 item_size; u32 cur; leaf = path->nodes[0]; slot = path->slots[0]; - /* this is where we start walking through the path */ - if (slot >= btrfs_header_nritems(leaf)) { - /* - * if we've reached the last slot in this leaf we need - * to go to the next leaf and reset everything - */ - ret = btrfs_next_leaf(root, path); - if (ret < 0) - goto err; - else if (ret > 0) - break; - continue; - } - - btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; if (found_key.type > BTRFS_XATTR_ITEM_KEY) break; if (found_key.type < BTRFS_XATTR_ITEM_KEY) - goto next_item; + continue; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); item_size = btrfs_item_size(leaf, slot); @@ -351,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) goto next; if (!buffer || (name_len + 1) > size_left) { - ret = -ERANGE; - goto err; + iter_ret = -ERANGE; + break; } read_extent_buffer(leaf, buffer, name_ptr, name_len); @@ -364,12 +345,13 @@ next: cur += this_len; di = (struct btrfs_dir_item *)((char *)di + this_len); } -next_item: - path->slots[0]++; } - ret = total_size; -err: + if (iter_ret < 0) + ret = iter_ret; + else + ret = total_size; + btrfs_free_path(path); return ret; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 767a0c6c9694..b4f44662cda7 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -97,7 +97,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, { struct workspace *workspace = list_entry(ws, struct workspace, list); int ret; - char *data_in; + char *data_in = NULL; char *cpage_out; int nr_pages = 0; struct page *in_page = NULL; @@ -126,7 +126,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[0] = out_page; nr_pages = 1; @@ -148,26 +148,26 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, int i; for (i = 0; i < in_buf_pages; i++) { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); memcpy(workspace->buf + i * PAGE_SIZE, data_in, PAGE_SIZE); start += PAGE_SIZE; } workspace->strm.next_in = workspace->buf; } else { - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } in_page = find_get_page(mapping, start >> PAGE_SHIFT); - data_in = kmap(in_page); + data_in = kmap_local_page(in_page); start += PAGE_SIZE; workspace->strm.next_in = data_in; } @@ -196,9 +196,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, * the stream end if required */ if (workspace->strm.avail_out == 0) { - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -207,7 +205,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -234,9 +232,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } else if (workspace->strm.avail_out == 0) { /* get another page for the stream end */ - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -245,7 
+241,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -ENOMEM; goto out; } - cpage_out = kmap(out_page); + cpage_out = page_address(out_page); pages[nr_pages] = out_page; nr_pages++; workspace->strm.avail_out = PAGE_SIZE; @@ -264,13 +260,11 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; - if (out_page) - kunmap(out_page); - - if (in_page) { - kunmap(in_page); + if (data_in) { + kunmap_local(data_in); put_page(in_page); } + return ret; } @@ -287,7 +281,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) unsigned long buf_start; struct page **pages_in = cb->compressed_pages; - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; workspace->strm.avail_in = min_t(size_t, srclen, PAGE_SIZE); workspace->strm.total_in = 0; @@ -309,7 +303,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { pr_warn("BTRFS: inflateInit failed\n"); - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); return -EIO; } while (workspace->strm.total_in < srclen) { @@ -336,13 +330,13 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) if (workspace->strm.avail_in == 0) { unsigned long tmp; - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); page_in_index++; if (page_in_index >= total_pages_in) { data_in = NULL; break; } - data_in = kmap(pages_in[page_in_index]); + data_in = kmap_local_page(pages_in[page_in_index]); workspace->strm.next_in = data_in; tmp = srclen - workspace->strm.total_in; workspace->strm.avail_in = min(tmp, PAGE_SIZE); @@ -355,7 +349,7 @@ int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb) done: zlib_inflateEnd(&workspace->strm); if (data_in) - kunmap(pages_in[page_in_index]); + kunmap_local(data_in); if (!ret) zero_fill_bio(cb->orig_bio); return ret; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 29b54fd9c128..b150b07ba1a7 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -51,11 +51,13 @@ #define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5) /* - * Maximum supported zone size. Currently, SMR disks have a zone size of - * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not - * expect the zone size to become larger than 8GiB in the near future. + * Minimum / maximum supported zone size. Currently, SMR disks have a zone + * size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. + * We do not expect the zone size to become larger than 8GiB or smaller than + * 4MiB in the near future. 
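/*
 * Hedged sketch (plain C, not kernel code): the zoned-mode hunks in this area
 * reject devices whose zone size falls outside a supported range and derive a
 * per-filesystem write limit as the minimum of two hardware limits, rounded
 * down to the sector size. Constants and demo_* helpers here are illustrative.
 */
#include <stdint.h>
#include <stdio.h>

#define DEMO_MIN_ZONE  (4ULL << 20)     /* 4MiB */
#define DEMO_MAX_ZONE  (8ULL << 30)     /* 8GiB */
#define DEMO_ALIGN_DOWN(x, a)  ((x) - ((x) % (a)))

static int demo_check_zone_size(uint64_t zone_size)
{
	if (zone_size > DEMO_MAX_ZONE || zone_size < DEMO_MIN_ZONE)
		return -1;              /* reject the device */
	return 0;
}

int main(void)
{
	uint64_t append_limit = 1 << 20;        /* device max zone-append payload */
	uint64_t segment_limit = 256 * 4096;    /* max segments times page size   */
	uint32_t sectorsize = 4096;

	uint64_t max_append = append_limit < segment_limit ? append_limit : segment_limit;
	max_append = DEMO_ALIGN_DOWN(max_append, sectorsize);

	printf("256MiB zone accepted: %d\n", demo_check_zone_size(256ULL << 20) == 0);
	printf("max zone append: %llu\n", (unsigned long long)max_append);
	return 0;
}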
*/ #define BTRFS_MAX_ZONE_SIZE SZ_8G +#define BTRFS_MIN_ZONE_SIZE SZ_4M #define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT) @@ -92,9 +94,9 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, * Possible states of log buffer zones * * Empty[0] In use[0] Full[0] - * Empty[1] * x 0 - * In use[1] 0 x 0 - * Full[1] 1 1 C + * Empty[1] * 0 1 + * In use[1] x x 1 + * Full[1] 0 0 C * * Log position: * *: Special case, no superblock is written @@ -401,11 +403,28 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); ret = -EINVAL; goto out; + } else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu smaller than supported minimum %u", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MIN_ZONE_SIZE); + ret = -EINVAL; + goto out; } nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->nr_zones = nr_sectors >> ilog2(zone_sectors); + /* + * We limit max_zone_append_size also by max_segments * + * PAGE_SIZE. Technically, we can have multiple pages per segment. But, + * since btrfs adds the pages one by one to a bio, and btrfs cannot + * increase the metadata reservation even if it increases the number of + * extents, it is safe to stick with the limit. + */ + zone_info->max_zone_append_size = + min_t(u64, (u64)bdev_max_zone_append_sectors(bdev) << SECTOR_SHIFT, + (u64)bdev_max_segments(bdev) << PAGE_SHIFT); if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; @@ -631,6 +650,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) u64 zoned_devices = 0; u64 nr_devices = 0; u64 zone_size = 0; + u64 max_zone_append_size = 0; const bool incompat_zoned = btrfs_fs_incompat(fs_info, ZONED); int ret = 0; @@ -665,6 +685,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) ret = -EINVAL; goto out; } + if (!max_zone_append_size || + (zone_info->max_zone_append_size && + zone_info->max_zone_append_size < max_zone_append_size)) + max_zone_append_size = + zone_info->max_zone_append_size; } nr_devices++; } @@ -714,7 +739,11 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) } fs_info->zone_size = zone_size; + fs_info->max_zone_append_size = ALIGN_DOWN(max_zone_append_size, + fs_info->sectorsize); fs_info->fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_ZONED; + if (fs_info->max_zone_append_size < fs_info->max_extent_size) + fs_info->max_extent_size = fs_info->max_zone_append_size; /* * Check mount options here, because we might change fs_info->zoned @@ -1726,12 +1755,14 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical, &mapped_length, &bioc); if (ret || !bioc || mapped_length < PAGE_SIZE) { - btrfs_put_bioc(bioc); - return -EIO; + ret = -EIO; + goto out_put_bioc; } - if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) - return -EINVAL; + if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) { + ret = -EINVAL; + goto out_put_bioc; + } nofs_flag = memalloc_nofs_save(); nmirrors = (int)bioc->num_stripes; @@ -1750,7 +1781,8 @@ static int read_zone_info(struct btrfs_fs_info *fs_info, u64 logical, break; } memalloc_nofs_restore(nofs_flag); - +out_put_bioc: + btrfs_put_bioc(bioc); return ret; } @@ -1817,6 +1849,7 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct 
btrfs_fs_info *fs_info = block_group->fs_info; + struct btrfs_space_info *space_info = block_group->space_info; struct map_lookup *map; struct btrfs_device *device; u64 physical; @@ -1828,6 +1861,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) map = block_group->physical_map; + spin_lock(&space_info->lock); spin_lock(&block_group->lock); if (block_group->zone_is_active) { ret = true; @@ -1835,7 +1869,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) } /* No space left */ - if (block_group->alloc_offset == block_group->zone_capacity) { + if (btrfs_zoned_bg_is_full(block_group)) { ret = false; goto out_unlock; } @@ -1856,7 +1890,10 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) /* Successfully activated all the zones */ block_group->zone_is_active = 1; + space_info->active_total_bytes += block_group->length; spin_unlock(&block_group->lock); + btrfs_try_granting_tickets(fs_info, space_info); + spin_unlock(&space_info->lock); /* For the active block group list */ btrfs_get_block_group(block_group); @@ -1869,23 +1906,17 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) out_unlock: spin_unlock(&block_group->lock); + spin_unlock(&space_info->lock); return ret; } -int btrfs_zone_finish(struct btrfs_block_group *block_group) +static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; struct map_lookup *map; - struct btrfs_device *device; - u64 physical; int ret = 0; int i; - if (!btrfs_is_zoned(fs_info)) - return 0; - - map = block_group->physical_map; - spin_lock(&block_group->lock); if (!block_group->zone_is_active) { spin_unlock(&block_group->lock); @@ -1895,38 +1926,48 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) /* Check if we have unwritten allocated space */ if ((block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) && - block_group->alloc_offset > block_group->meta_write_pointer) { + block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) { spin_unlock(&block_group->lock); return -EAGAIN; } - spin_unlock(&block_group->lock); - - ret = btrfs_inc_block_group_ro(block_group, false); - if (ret) - return ret; - - /* Ensure all writes in this block group finish */ - btrfs_wait_block_group_reservations(block_group); - /* No need to wait for NOCOW writers. Zoned mode does not allow that. */ - btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, - block_group->length); - - spin_lock(&block_group->lock); /* - * Bail out if someone already deactivated the block group, or - * allocated space is left in the block group. + * If we are sure that the block group is full (= no more room left for + * new allocation) and the IO for the last usable block is completed, we + * don't need to wait for the other IOs. This holds because we ensure + * the sequential IO submissions using the ZONE_APPEND command for data + * and block_group->meta_write_pointer for metadata. */ - if (!block_group->zone_is_active) { + if (!fully_written) { spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return 0; - } - if (block_group->reserved) { - spin_unlock(&block_group->lock); - btrfs_dec_block_group_ro(block_group); - return -EAGAIN; + ret = btrfs_inc_block_group_ro(block_group, false); + if (ret) + return ret; + + /* Ensure all writes in this block group finish */ + btrfs_wait_block_group_reservations(block_group); + /* No need to wait for NOCOW writers. 
Zoned mode does not allow that */ + btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start, + block_group->length); + + spin_lock(&block_group->lock); + + /* + * Bail out if someone already deactivated the block group, or + * allocated space is left in the block group. + */ + if (!block_group->zone_is_active) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return 0; + } + + if (block_group->reserved) { + spin_unlock(&block_group->lock); + btrfs_dec_block_group_ro(block_group); + return -EAGAIN; + } } block_group->zone_is_active = 0; @@ -1936,9 +1977,10 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_clear_data_reloc_bg(block_group); spin_unlock(&block_group->lock); + map = block_group->physical_map; for (i = 0; i < map->num_stripes; i++) { - device = map->stripes[i].dev; - physical = map->stripes[i].physical; + struct btrfs_device *device = map->stripes[i].dev; + const u64 physical = map->stripes[i].physical; if (device->zone_info->max_active_zones == 0) continue; @@ -1953,7 +1995,9 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) btrfs_dev_clear_active_zone(device, physical); } - btrfs_dec_block_group_ro(block_group); + + if (!fully_written) + btrfs_dec_block_group_ro(block_group); spin_lock(&fs_info->zone_active_bgs_lock); ASSERT(!list_empty(&block_group->active_bg_list)); @@ -1963,9 +2007,20 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group) /* For active_bg_list */ btrfs_put_block_group(block_group); + clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + wake_up_all(&fs_info->zone_finish_wait); + return 0; } +int btrfs_zone_finish(struct btrfs_block_group *block_group) +{ + if (!btrfs_is_zoned(block_group->fs_info)) + return 0; + + return do_zone_finish(block_group, false); +} + bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) { struct btrfs_fs_info *fs_info = fs_devices->fs_info; @@ -1991,15 +2046,16 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) } mutex_unlock(&fs_info->chunk_mutex); + if (!ret) + set_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags); + return ret; } void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { struct btrfs_block_group *block_group; - struct map_lookup *map; - struct btrfs_device *device; - u64 physical; + u64 min_alloc_bytes; if (!btrfs_is_zoned(fs_info)) return; @@ -2007,42 +2063,52 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len block_group = btrfs_lookup_block_group(fs_info, logical); ASSERT(block_group); - if (logical + length < block_group->start + block_group->zone_capacity) - goto out; - - spin_lock(&block_group->lock); + /* No MIXED_BG on zoned btrfs. */ + if (block_group->flags & BTRFS_BLOCK_GROUP_DATA) + min_alloc_bytes = fs_info->sectorsize; + else + min_alloc_bytes = fs_info->nodesize; - if (!block_group->zone_is_active) { - spin_unlock(&block_group->lock); + /* Bail out if we can allocate more data from this block group. 
*/ + if (logical + length + min_alloc_bytes <= + block_group->start + block_group->zone_capacity) goto out; - } - block_group->zone_is_active = 0; - /* We should have consumed all the free space */ - ASSERT(block_group->alloc_offset == block_group->zone_capacity); - ASSERT(block_group->free_space_ctl->free_space == 0); - btrfs_clear_treelog_bg(block_group); - btrfs_clear_data_reloc_bg(block_group); - spin_unlock(&block_group->lock); + do_zone_finish(block_group, true); - map = block_group->physical_map; - device = map->stripes[0].dev; - physical = map->stripes[0].physical; +out: + btrfs_put_block_group(block_group); +} - if (!device->zone_info->max_active_zones) - goto out; +static void btrfs_zone_finish_endio_workfn(struct work_struct *work) +{ + struct btrfs_block_group *bg = + container_of(work, struct btrfs_block_group, zone_finish_work); - btrfs_dev_clear_active_zone(device, physical); + wait_on_extent_buffer_writeback(bg->last_eb); + free_extent_buffer(bg->last_eb); + btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length); + btrfs_put_block_group(bg); +} - spin_lock(&fs_info->zone_active_bgs_lock); - ASSERT(!list_empty(&block_group->active_bg_list)); - list_del_init(&block_group->active_bg_list); - spin_unlock(&fs_info->zone_active_bgs_lock); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) +{ + if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity) + return; - btrfs_put_block_group(block_group); + if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) { + btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing", + bg->start); + return; + } -out: - btrfs_put_block_group(block_group); + /* For the work */ + btrfs_get_block_group(bg); + atomic_inc(&eb->refs); + bg->last_eb = eb; + INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn); + queue_work(system_unbound_wq, &bg->zone_finish_work); } void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) @@ -2072,3 +2138,150 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *device; + u64 used = 0; + u64 total = 0; + u64 factor; + + ASSERT(btrfs_is_zoned(fs_info)); + + if (fs_info->bg_reclaim_threshold == 0) + return false; + + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (!device->bdev) + continue; + + total += device->disk_total_bytes; + used += device->bytes_used; + } + mutex_unlock(&fs_devices->device_list_mutex); + + factor = div64_u64(used * 100, total); + return factor >= fs_info->bg_reclaim_threshold; +} + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. 
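/*
 * Hedged sketch (plain C, not kernel code): btrfs_zoned_should_reclaim() above
 * sums per-device used and total bytes and triggers reclaim once the used
 * percentage reaches the configurable threshold. The guard against a zero
 * total below is an addition of this sketch, not taken from the patch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool demo_should_reclaim(uint64_t used, uint64_t total, unsigned int threshold)
{
	if (threshold == 0 || total == 0)
		return false;                   /* reclaim disabled or nothing to measure */
	return (used * 100 / total) >= threshold;
}

int main(void)
{
	printf("%d\n", demo_should_reclaim(80ULL << 30, 100ULL << 30, 75));  /* 1 */
	printf("%d\n", demo_should_reclaim(50ULL << 30, 100ULL << 30, 75));  /* 0 */
	return 0;
}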
*/ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} + +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group *block_group; + struct btrfs_block_group *min_bg = NULL; + u64 min_avail = U64_MAX; + int ret; + + spin_lock(&fs_info->zone_active_bgs_lock); + list_for_each_entry(block_group, &fs_info->zone_active_bgs, + active_bg_list) { + u64 avail; + + spin_lock(&block_group->lock); + if (block_group->reserved || + (block_group->flags & BTRFS_BLOCK_GROUP_SYSTEM)) { + spin_unlock(&block_group->lock); + continue; + } + + avail = block_group->zone_capacity - block_group->alloc_offset; + if (min_avail > avail) { + if (min_bg) + btrfs_put_block_group(min_bg); + min_bg = block_group; + min_avail = avail; + btrfs_get_block_group(min_bg); + } + spin_unlock(&block_group->lock); + } + spin_unlock(&fs_info->zone_active_bgs_lock); + + if (!min_bg) + return 0; + + ret = btrfs_zone_finish(min_bg); + btrfs_put_block_group(min_bg); + + return ret < 0 ? ret : 1; +} + +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + struct btrfs_block_group *bg; + int index; + + if (!btrfs_is_zoned(fs_info) || (space_info->flags & BTRFS_BLOCK_GROUP_DATA)) + return 0; + + /* No more block groups to activate */ + if (space_info->active_total_bytes == space_info->total_bytes) + return 0; + + for (;;) { + int ret; + bool need_finish = false; + + down_read(&space_info->groups_sem); + for (index = 0; index < BTRFS_NR_RAID_TYPES; index++) { + list_for_each_entry(bg, &space_info->block_groups[index], + list) { + if (!spin_trylock(&bg->lock)) + continue; + if (btrfs_zoned_bg_is_full(bg) || bg->zone_is_active) { + spin_unlock(&bg->lock); + continue; + } + spin_unlock(&bg->lock); + + if (btrfs_zone_activate(bg)) { + up_read(&space_info->groups_sem); + return 1; + } + + need_finish = true; + } + } + up_read(&space_info->groups_sem); + + if (!do_finish || !need_finish) + break; + + ret = btrfs_zone_finish_one_bg(fs_info); + if (ret == 0) + break; + if (ret < 0) + return ret; + } + + return 0; +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 6dee76248cb4..e17462db3a84 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -10,11 +10,7 @@ #include "block-group.h" #include "btrfs_inode.h" -/* - * Block groups with more than this value (percents) of unusable space will be - * scheduled for background reclaim. 
- */ -#define BTRFS_DEFAULT_RECLAIM_THRESH 75 +#define BTRFS_DEFAULT_RECLAIM_THRESH (75) struct btrfs_zoned_device_info { /* @@ -23,6 +19,7 @@ struct btrfs_zoned_device_info { */ u64 zone_size; u8 zone_size_shift; + u64 max_zone_append_size; u32 nr_zones; unsigned int max_active_zones; atomic_t active_zones_left; @@ -76,8 +73,16 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group); bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags); void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length); +void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); +int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info); +int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, bool do_finish); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -233,9 +238,34 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length) { } +static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg, + struct extent_buffer *eb) { } + static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info) +{ + return false; +} + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } + +static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info) +{ + return 1; +} + +static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + bool do_finish) +{ + /* Consider all the block groups are active */ + return 0; +} + #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) @@ -370,4 +400,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode) mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock); } +static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg) +{ + ASSERT(btrfs_is_zoned(bg->fs_info)); + return (bg->alloc_offset == bg->zone_capacity); +} + #endif diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index fc42dd0badd7..35a0224d4eb7 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list) void zstd_free_workspace(struct list_head *ws); struct list_head *zstd_alloc_workspace(unsigned int level); -/* - * zstd_reclaim_timer_fn - reclaim timer + +/** + * Timer callback to free unused workspaces. + * * @t: timer * * This scans the lru_list and attempts to reclaim any workspace that hasn't * been used for ZSTD_BTRFS_RECLAIM_JIFFIES. + * + * The context is softirq and does not need the _bh locking primitives. 
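/*
 * Hedged sketch (plain C, not kernel code): the zstd reclaim timer above walks
 * an LRU list of compression workspaces and frees the ones that have not been
 * used for a while, stopping at the first fresh entry because the list is kept
 * in least-recently-used order. A stripped-down model of that aging policy,
 * with invented demo_* names and seconds standing in for jiffies:
 */
#include <stdio.h>
#include <time.h>

#define DEMO_RECLAIM_AGE 300            /* seconds of idleness before a workspace is dropped */

struct demo_ws { time_t last_used; int freed; };

static void demo_reclaim(struct demo_ws *ws, int nr, time_t now)
{
	for (int i = 0; i < nr; i++) {
		if (now - ws[i].last_used < DEMO_RECLAIM_AGE)
			break;          /* everything after this one is fresher: stop */
		ws[i].freed = 1;        /* a real implementation would free the buffers here */
	}
}

int main(void)
{
	time_t now = time(NULL);
	struct demo_ws ws[3] = {
		{ .last_used = now - 1000 },    /* idle for a long time: reclaimed */
		{ .last_used = now - 400  },    /* also stale: reclaimed           */
		{ .last_used = now - 10   },    /* recently used: kept             */
	};

	demo_reclaim(ws, 3, now);
	for (int i = 0; i < 3; i++)
		printf("ws[%d] %s\n", i, ws[i].freed ? "reclaimed" : "kept");
	return 0;
}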
*/ static void zstd_reclaim_timer_fn(struct timer_list *timer) { unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES; struct list_head *pos, *next; - spin_lock_bh(&wsm.lock); + spin_lock(&wsm.lock); if (list_empty(&wsm.lru_list)) { - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); return; } @@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) if (!list_empty(&wsm.lru_list)) mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES); - spin_unlock_bh(&wsm.lock); + spin_unlock(&wsm.lock); } /* @@ -399,7 +403,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* map in the first page of input data */ in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); @@ -411,7 +415,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -446,9 +450,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, if (workspace->out_buf.pos == workspace->out_buf.size) { tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -458,7 +460,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); @@ -473,13 +475,12 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, /* Check if we need more input */ if (workspace->in_buf.pos == workspace->in_buf.size) { tot_in += PAGE_SIZE; - kunmap(in_page); + kunmap_local(workspace->in_buf.src); put_page(in_page); - start += PAGE_SIZE; len -= PAGE_SIZE; in_page = find_get_page(mapping, start >> PAGE_SHIFT); - workspace->in_buf.src = kmap(in_page); + workspace->in_buf.src = kmap_local_page(in_page); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); } @@ -506,9 +507,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, tot_out += PAGE_SIZE; max_out -= PAGE_SIZE; - kunmap(out_page); if (nr_pages == nr_dest_pages) { - out_page = NULL; ret = -E2BIG; goto out; } @@ -518,7 +517,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, goto out; } pages[nr_pages++] = out_page; - workspace->out_buf.dst = kmap(out_page); + workspace->out_buf.dst = page_address(out_page); workspace->out_buf.pos = 0; workspace->out_buf.size = min_t(size_t, max_out, PAGE_SIZE); } @@ -533,13 +532,10 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, *total_out = tot_out; out: *out_pages = nr_pages; - /* Cleanup */ - if (in_page) { - kunmap(in_page); + if (workspace->in_buf.src) { + kunmap_local(workspace->in_buf.src); put_page(in_page); } - if (out_page) - kunmap(out_page); return ret; } @@ -563,7 +559,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) goto done; } - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); 
workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); @@ -599,14 +595,15 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) break; if (workspace->in_buf.pos == workspace->in_buf.size) { - kunmap(pages_in[page_in_index++]); + kunmap_local(workspace->in_buf.src); + page_in_index++; if (page_in_index >= total_pages_in) { workspace->in_buf.src = NULL; ret = -EIO; goto done; } srclen -= PAGE_SIZE; - workspace->in_buf.src = kmap(pages_in[page_in_index]); + workspace->in_buf.src = kmap_local_page(pages_in[page_in_index]); workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, srclen, PAGE_SIZE); } @@ -615,7 +612,7 @@ int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb) zero_fill_bio(cb->orig_bio); done: if (workspace->in_buf.src) - kunmap(pages_in[page_in_index]); + kunmap_local(workspace->in_buf.src); return ret; } |
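
For reference, the new btrfs_zone_finish_one_bg() above picks its victim by scanning fs_info->zone_active_bgs under the list lock, skipping reserved and SYSTEM block groups, and remembering the group with the least remaining zone capacity; a reference is taken on the running minimum so it stays valid once the lock is dropped. A minimal sketch of that select-and-pin pattern, not taken from the patch, with made-up names (struct demo_bg, demo_pick_fullest(), demo_bg_release()) standing in for the btrfs types:

#include <linux/kernel.h>
#include <linux/kref.h>
#include <linux/limits.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct demo_bg {
	struct list_head link;	/* entry on the active list */
	struct kref refs;
	u64 capacity;		/* usable bytes in the zone */
	u64 alloc_offset;	/* bytes already allocated */
};

static void demo_bg_release(struct kref *refs)
{
	kfree(container_of(refs, struct demo_bg, refs));
}

/* Return the active group with the least free room, pinned, or NULL. */
static struct demo_bg *demo_pick_fullest(spinlock_t *lock,
					 struct list_head *active)
{
	struct demo_bg *bg, *min_bg = NULL;
	u64 min_avail = U64_MAX;

	spin_lock(lock);
	list_for_each_entry(bg, active, link) {
		u64 avail = bg->capacity - bg->alloc_offset;

		if (avail < min_avail) {
			/* Drop the previous candidate, pin the new one. */
			if (min_bg)
				kref_put(&min_bg->refs, demo_bg_release);
			min_bg = bg;
			min_avail = avail;
			kref_get(&min_bg->refs);
		}
	}
	spin_unlock(lock);

	/* Caller zone-finishes *min_bg and then drops the reference. */
	return min_bg;
}

The real function additionally returns 1 only when it actually finished a group, which lets btrfs_zoned_activate_one_bg() keep retrying activation until no further group can be finished.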
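The zstd change from spin_lock_bh() to spin_lock() in zstd_reclaim_timer_fn() relies on the fact that an ordinary timer callback already runs in softirq context, so it cannot be re-entered by another softirq on the same CPU; process-context users of the same lock keep using the _bh variants so the timer cannot fire on top of them while they hold the lock. A minimal sketch of that split, not taken from the patch, assuming a made-up workspace manager (demo_wsm) and callbacks (demo_reclaim_timer_fn(), demo_put_workspace()):

#include <linux/jiffies.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/timer.h>

static struct {
	spinlock_t lock;
	struct list_head lru_list;
	struct timer_list timer;	/* set up with timer_setup() at init */
} demo_wsm;

/* Runs in softirq context: the plain spin_lock() is enough here. */
static void demo_reclaim_timer_fn(struct timer_list *timer)
{
	spin_lock(&demo_wsm.lock);
	/* ... walk demo_wsm.lru_list and free entries idle for too long ... */
	if (!list_empty(&demo_wsm.lru_list))
		mod_timer(&demo_wsm.timer, jiffies + 5 * HZ);
	spin_unlock(&demo_wsm.lock);
}

/* Runs in process context: _bh keeps the timer from preempting us here. */
static void demo_put_workspace(struct list_head *ws)
{
	spin_lock_bh(&demo_wsm.lock);
	list_add(ws, &demo_wsm.lru_list);
	mod_timer(&demo_wsm.timer, jiffies + 5 * HZ);
	spin_unlock_bh(&demo_wsm.lock);
}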
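The kmap()/kunmap() to kmap_local_page()/kunmap_local() conversion in zstd.c changes the pairing rule: the local unmap takes the address returned by the map rather than the page, and the mapping is only valid on the mapping CPU and must be released in reverse order of creation. The output pages are freshly allocated without __GFP_HIGHMEM, so the patch drops the mapping for them entirely and uses page_address() instead. A minimal sketch of both halves, not taken from the patch, using a made-up helper (demo_copy_page()) that is not part of btrfs:

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/highmem.h>
#include <linux/string.h>

/* Copy one page: the source may be highmem, the destination never is. */
static int demo_copy_page(struct page *src, struct page **dst_ret)
{
	struct page *dst = alloc_page(GFP_NOFS);	/* lowmem: no kmap needed */
	void *src_addr;

	if (!dst)
		return -ENOMEM;

	src_addr = kmap_local_page(src);		/* CPU-local mapping */
	memcpy(page_address(dst), src_addr, PAGE_SIZE);
	kunmap_local(src_addr);				/* unmap by address */

	*dst_ret = dst;
	return 0;
}

The same rule is visible in the decompression hunk above, where kunmap(pages_in[page_in_index]) becomes kunmap_local(workspace->in_buf.src): the saved mapping address, not the page index, identifies what to unmap.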