author     Kent Overstreet <kent.overstreet@gmail.com>   2017-12-06 06:17:57 -0500
committer  Kent Overstreet <kent.overstreet@gmail.com>   2018-05-22 00:44:18 -0400
commit     f29ea9e94f79bab72e9788bccae01c9220da5223 (patch)
tree       f3c8b67cf05cdc65d6fbf07bee37279ec6217497
parent     50ed8c955d4cc962ab3b736d0264ae4fee5156d5 (diff)
bcachefs: Change open_bucket to only point to a single bucket
This solves a deadlock, and is also a major simplification of the
allocation code.
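The heart of the change shows up in the alloc_types.h hunk further down: an open_bucket used to carry its own array of partially filled bucket pointers, and now refers to exactly one bucket, with the write_point holding the per-replica set as references instead. Below is a simplified before/after sketch of those structs; the field lists follow the diff, the struct names are suffixed _before/_after so both can coexist, and the lock, refcount and extent-pointer types are reduced to stand-alone stubs.

#include <stdbool.h>
#include <stdint.h>

#define BCH_REPLICAS_MAX 4			/* illustrative value only */

struct bch_extent_ptr {				/* stub for the real extent pointer */
	uint64_t	offset;
	uint8_t		dev;
	uint8_t		gen;
};

/* Before: one open_bucket owned a whole set of partially filled buckets. */
struct open_bucket_ptr {
	struct bch_extent_ptr	ptr;
	unsigned		sectors_free;
};

struct open_bucket_before {
	int			pin;		/* atomic_t in the kernel */
	uint8_t			freelist;
	uint8_t			new_ob;
	uint8_t			nr_ptrs;
	struct open_bucket_ptr	ptrs[BCH_REPLICAS_MAX * 2];
};

/* After: an open_bucket is exactly one bucket on one device ... */
struct open_bucket_after {
	int			pin;		/* atomic_t in the kernel */
	uint8_t			freelist;
	bool			valid;
	bool			on_partial_list;
	unsigned		sectors_free;
	struct bch_extent_ptr	ptr;
};

/* ... and the write_point owns the per-replica set, as references. */
struct write_point_after {			/* trimmed to the fields the patch touches */
	uint8_t			nr_ptrs;
	uint8_t			nr_ptrs_can_use;
	unsigned		sectors_free;
	struct open_bucket_after *ptrs[BCH_REPLICAS_MAX * 2];
};

Note how the diff also drops c->alloc_gc_lock and the spin_lock_nested() ordering dance in open_bucket_drop_ptrs(): with single-bucket open buckets there is no longer a pointer list to migrate from one open bucket to another.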
-rw-r--r-- | fs/bcachefs/alloc.c                 | 622
-rw-r--r-- | fs/bcachefs/alloc.h                 |  60
-rw-r--r-- | fs/bcachefs/alloc_types.h           |  18
-rw-r--r-- | fs/bcachefs/bcachefs.h              |  21
-rw-r--r-- | fs/bcachefs/btree_gc.c              |  21
-rw-r--r-- | fs/bcachefs/btree_types.h           |  12
-rw-r--r-- | fs/bcachefs/btree_update_interior.c |  66
-rw-r--r-- | fs/bcachefs/io.c                    |  29
-rw-r--r-- | fs/bcachefs/io.h                    |  35
-rw-r--r-- | fs/bcachefs/io_types.h              |   1
-rw-r--r-- | fs/bcachefs/journal.c               |  33
-rw-r--r-- | include/trace/events/bcachefs.h     |  50
12 files changed, 382 insertions, 586 deletions
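Before reading the diff itself, two short pieces of the new fs/bcachefs/alloc.h interface are worth pulling out: the allocation status codes become negative values, so bch2_bucket_alloc() can return either an open-bucket index (always greater than zero) or an error in a single int, and a write point's buckets are now walked with writepoint_for_each_ptr() rather than open_bucket_for_each_ptr(). Both are quoted from the patch:

enum bucket_alloc_ret {
	ALLOC_SUCCESS		= 0,
	OPEN_BUCKETS_EMPTY	= -1,
	FREELIST_EMPTY		= -2,	/* Allocator thread not keeping up */
	NO_DEVICES		= -3,	/* -EROFS */
};

/* Walk every open_bucket a write point currently holds a reference to: */
#define writepoint_for_each_ptr(_wp, _ob, _i)				\
	for ((_i) = 0;							\
	     (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true);	\
	     (_i)++)

A caller such as __bch2_bucket_alloc_set() therefore only needs a single `if (ob < 0)` check to detect failure and, on success, stores c->open_buckets + ob into wp->ptrs[].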
diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index a5babb1ac68d..59dc26ee5484 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -584,15 +584,15 @@ static void bch2_invalidate_one_bucket(struct bch_fs *c, struct bch_dev *ca, { struct bucket_mark m; - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); if (!bch2_invalidate_bucket(ca, g, &m)) { - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); return; } verify_not_on_freelist(c, ca, g - ca->buckets); BUG_ON(!fifo_push(&ca->free_inc, g - ca->buckets)); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); g->prio[READ] = c->prio_clock[READ].hand; g->prio[WRITE] = c->prio_clock[WRITE].hand; @@ -821,7 +821,7 @@ static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, * Don't remove from free_inc until after it's added to * freelist, so gc can find it: */ - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); for (i = 0; i < RESERVE_NR; i++) if (fifo_push(&ca->free[i], bucket)) { fifo_pop(&ca->free_inc, bucket); @@ -829,7 +829,7 @@ static void discard_invalidated_bucket(struct bch_fs *c, struct bch_dev *ca, pushed = true; break; } - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); if (pushed) break; @@ -940,12 +940,12 @@ static int bch2_allocator_thread(void *arg) BUG_ON(ca->free_inc.front); - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); sort(ca->free_inc.data, ca->free_inc.back, sizeof(ca->free_inc.data[0]), size_t_cmp, NULL); - spin_unlock(&ca->freelist_lock); + spin_unlock(&c->freelist_lock); /* * free_inc is now full of newly-invalidated buckets: next, @@ -957,6 +957,54 @@ static int bch2_allocator_thread(void *arg) /* Allocation */ /* + * Open buckets represent a bucket that's currently being allocated from. They + * serve two purposes: + * + * - They track buckets that have been partially allocated, allowing for + * sub-bucket sized allocations - they're used by the sector allocator below + * + * - They provide a reference to the buckets they own that mark and sweep GC + * can find, until the new allocation has a pointer to it inserted into the + * btree + * + * When allocating some space with the sector allocator, the allocation comes + * with a reference to an open bucket - the caller is required to put that + * reference _after_ doing the index update that makes its allocation reachable. + */ + +void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = c->devs[ob->ptr.dev]; + + spin_lock(&ob->lock); + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ob->ptr), false); + ob->valid = false; + spin_unlock(&ob->lock); + + spin_lock(&c->freelist_lock); + ob->freelist = c->open_buckets_freelist; + c->open_buckets_freelist = ob - c->open_buckets; + c->open_buckets_nr_free++; + spin_unlock(&c->freelist_lock); + + closure_wake_up(&c->open_buckets_wait); +} + +static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) +{ + struct open_bucket *ob; + + BUG_ON(!c->open_buckets_freelist || !c->open_buckets_nr_free); + + ob = c->open_buckets + c->open_buckets_freelist; + c->open_buckets_freelist = ob->freelist; + atomic_set(&ob->pin, 1); + + c->open_buckets_nr_free--; + return ob; +} + +/* * XXX: allocation on startup is still sketchy. 
There is insufficient * synchronization for bch2_bucket_alloc_startup() to work correctly after * bch2_alloc_write() has been called, and we aren't currently doing anything @@ -995,6 +1043,18 @@ out: return r; } +static inline unsigned open_buckets_reserved(enum alloc_reserve reserve) +{ + switch (reserve) { + case RESERVE_ALLOC: + return 0; + case RESERVE_BTREE: + return BTREE_NODE_RESERVE / 2; + default: + return BTREE_NODE_RESERVE; + } +} + /** * bch_bucket_alloc - allocate a single bucket from a specific device * @@ -1003,77 +1063,85 @@ out: int bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum alloc_reserve reserve, bool may_alloc_partial, - struct open_bucket_ptr *ret) + struct closure *cl) { - size_t r; + struct open_bucket *ob; + long bucket; - spin_lock(&ca->freelist_lock); + spin_lock(&c->freelist_lock); if (may_alloc_partial && ca->open_buckets_partial_nr) { - *ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; - spin_unlock(&ca->freelist_lock); - return 0; + int ret = ca->open_buckets_partial[--ca->open_buckets_partial_nr]; + c->open_buckets[ret].on_partial_list = false; + spin_unlock(&c->freelist_lock); + return ret; } - if (likely(fifo_pop(&ca->free[RESERVE_NONE], r))) + if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(reserve))) { + if (cl) + closure_wait(&c->open_buckets_wait, cl); + spin_unlock(&c->freelist_lock); + trace_open_bucket_alloc_fail(ca, reserve); + return OPEN_BUCKETS_EMPTY; + } + + if (likely(fifo_pop(&ca->free[RESERVE_NONE], bucket))) goto out; switch (reserve) { case RESERVE_ALLOC: - if (fifo_pop(&ca->free[RESERVE_BTREE], r)) + if (fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_BTREE: if (fifo_used(&ca->free[RESERVE_BTREE]) * 2 >= ca->free[RESERVE_BTREE].size && - fifo_pop(&ca->free[RESERVE_BTREE], r)) + fifo_pop(&ca->free[RESERVE_BTREE], bucket)) goto out; break; case RESERVE_MOVINGGC: - if (fifo_pop(&ca->free[RESERVE_MOVINGGC], r)) + if (fifo_pop(&ca->free[RESERVE_MOVINGGC], bucket)) goto out; break; default: break; } - spin_unlock(&ca->freelist_lock); - if (unlikely(!ca->alloc_thread_started) && (reserve == RESERVE_ALLOC) && - (r = bch2_bucket_alloc_startup(c, ca)) >= 0) { - verify_not_on_freelist(c, ca, r); - goto out2; - } + (bucket = bch2_bucket_alloc_startup(c, ca)) >= 0) + goto out; + + spin_unlock(&c->freelist_lock); trace_bucket_alloc_fail(ca, reserve); - return -1; + return FREELIST_EMPTY; out: - verify_not_on_freelist(c, ca, r); - spin_unlock(&ca->freelist_lock); + verify_not_on_freelist(c, ca, bucket); - bch2_wake_allocator(ca); -out2: - *ret = (struct open_bucket_ptr) { - .ptr.gen = ca->buckets[r].mark.gen, - .ptr.offset = bucket_to_sector(ca, r), - .ptr.dev = ca->dev_idx, - .sectors_free = ca->mi.bucket_size, + ob = bch2_open_bucket_alloc(c); + + spin_lock(&ob->lock); + ob->valid = true; + ob->sectors_free = ca->mi.bucket_size; + ob->ptr = (struct bch_extent_ptr) { + .gen = ca->buckets[bucket].mark.gen, + .offset = bucket_to_sector(ca, bucket), + .dev = ca->dev_idx, }; + spin_unlock(&ob->lock); - ca->buckets[r].prio[READ] = c->prio_clock[READ].hand; - ca->buckets[r].prio[WRITE] = c->prio_clock[WRITE].hand; + spin_unlock(&c->freelist_lock); + + bch2_wake_allocator(ca); + + ca->buckets[bucket].prio[READ] = c->prio_clock[READ].hand; + ca->buckets[bucket].prio[WRITE] = c->prio_clock[WRITE].hand; trace_bucket_alloc(ca, reserve); - return 0; + return ob - c->open_buckets; } -enum bucket_alloc_ret { - ALLOC_SUCCESS, - NO_DEVICES, /* -EROFS */ - FREELIST_EMPTY, /* Allocator thread not 
keeping up */ -}; - struct dev_alloc_list bch2_wp_alloc_list(struct bch_fs *c, struct write_point *wp, struct bch_devs_mask *devs) @@ -1121,40 +1189,42 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, unsigned nr_replicas, enum alloc_reserve reserve, - struct bch_devs_mask *devs) + struct bch_devs_mask *devs, + struct closure *cl) { enum bucket_alloc_ret ret = NO_DEVICES; - struct open_bucket *ob = wp->ob; struct dev_alloc_list devs_sorted; u64 buckets_free; unsigned i; - BUG_ON(nr_replicas > ARRAY_SIZE(ob->ptrs)); + BUG_ON(nr_replicas > ARRAY_SIZE(wp->ptrs)); - if (ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return ALLOC_SUCCESS; rcu_read_lock(); devs_sorted = bch2_wp_alloc_list(c, wp, devs); - spin_lock(&ob->lock); for (i = 0; i < devs_sorted.nr; i++) { struct bch_dev *ca = rcu_dereference(c->devs[devs_sorted.devs[i]]); - struct open_bucket_ptr ptr; + int ob; if (!ca) continue; - if (bch2_bucket_alloc(c, ca, reserve, - wp->type == BCH_DATA_USER, - &ptr)) { - ret = FREELIST_EMPTY; + ob = bch2_bucket_alloc(c, ca, reserve, + wp->type == BCH_DATA_USER, cl); + if (ob < 0) { + ret = ob; + if (ret == OPEN_BUCKETS_EMPTY) + break; continue; } - BUG_ON(ob->nr_ptrs >= ARRAY_SIZE(ob->ptrs)); - ob->ptrs[ob->nr_ptrs++] = ptr; + BUG_ON(ob <= 0 || ob > U8_MAX); + BUG_ON(wp->nr_ptrs >= ARRAY_SIZE(wp->ptrs)); + wp->ptrs[wp->nr_ptrs++] = c->open_buckets + ob; buckets_free = U64_MAX, dev_buckets_free(ca); if (buckets_free) @@ -1167,14 +1237,15 @@ static enum bucket_alloc_ret __bch2_bucket_alloc_set(struct bch_fs *c, __clear_bit(ca->dev_idx, devs->d); - if (ob->nr_ptrs == nr_replicas) { + if (wp->nr_ptrs == nr_replicas) { ret = ALLOC_SUCCESS; break; } } - EBUG_ON(ret != ALLOC_SUCCESS && reserve == RESERVE_MOVINGGC); - spin_unlock(&ob->lock); + EBUG_ON(reserve == RESERVE_MOVINGGC && + ret != ALLOC_SUCCESS && + ret != OPEN_BUCKETS_EMPTY); rcu_read_unlock(); return ret; } @@ -1189,7 +1260,7 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, while (1) { switch (__bch2_bucket_alloc_set(c, wp, nr_replicas, - reserve, devs)) { + reserve, devs, cl)) { case ALLOC_SUCCESS: if (waiting) closure_wake_up(&c->freelist_wait); @@ -1202,10 +1273,6 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, return -EROFS; case FREELIST_EMPTY: - if (!cl || waiting) - trace_freelist_empty_fail(c, - reserve, cl); - if (!cl) return -ENOSPC; @@ -1216,199 +1283,67 @@ static int bch2_bucket_alloc_set(struct bch_fs *c, struct write_point *wp, closure_wait(&c->freelist_wait, cl); waiting = true; break; + case OPEN_BUCKETS_EMPTY: + return cl ? -EAGAIN : -ENOSPC; default: BUG(); } } } -/* Open buckets: */ - -/* - * Open buckets represent one or more buckets (on multiple devices) that are - * currently being allocated from. They serve two purposes: - * - * - They track buckets that have been partially allocated, allowing for - * sub-bucket sized allocations - they're used by the sector allocator below - * - * - They provide a reference to the buckets they own that mark and sweep GC - * can find, until the new allocation has a pointer to it inserted into the - * btree - * - * When allocating some space with the sector allocator, the allocation comes - * with a reference to an open bucket - the caller is required to put that - * reference _after_ doing the index update that makes its allocation reachable. 
- */ +/* Sector allocator */ -void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +static void writepoint_drop_ptrs(struct bch_fs *c, + struct write_point *wp, + struct bch_devs_mask *devs, + unsigned nr_ptrs_dislike) { - const struct open_bucket_ptr *ptr; - u8 new_ob; + int i; - if (!atomic_dec_and_test(&ob->pin)) + if (!nr_ptrs_dislike) return; - down_read(&c->alloc_gc_lock); - spin_lock(&ob->lock); - - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; - if (ptr->sectors_free) { - /* - * This is a ptr to a bucket that still has free space, - * but we don't want to use it - */ + if (nr_ptrs_dislike && !test_bit(ob->ptr.dev, devs->d)) { BUG_ON(ca->open_buckets_partial_nr >= ARRAY_SIZE(ca->open_buckets_partial)); - spin_lock(&ca->freelist_lock); - ca->open_buckets_partial[ca->open_buckets_partial_nr++] - = *ptr; - spin_unlock(&ca->freelist_lock); - } else { - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), false); - } - } - ob->nr_ptrs = 0; - - spin_unlock(&ob->lock); - up_read(&c->alloc_gc_lock); - - new_ob = ob->new_ob; - ob->new_ob = 0; - - spin_lock(&c->open_buckets_lock); - ob->freelist = c->open_buckets_freelist; - c->open_buckets_freelist = ob - c->open_buckets; - c->open_buckets_nr_free++; - spin_unlock(&c->open_buckets_lock); - - closure_wake_up(&c->open_buckets_wait); - - if (new_ob) - bch2_open_bucket_put(c, c->open_buckets + new_ob); -} - -static struct open_bucket *bch2_open_bucket_get(struct bch_fs *c, - unsigned nr_reserved, - struct closure *cl) -{ - struct open_bucket *ret; - - spin_lock(&c->open_buckets_lock); - - if (c->open_buckets_nr_free > nr_reserved) { - BUG_ON(!c->open_buckets_freelist); - - ret = c->open_buckets + c->open_buckets_freelist; - c->open_buckets_freelist = ret->freelist; - atomic_set(&ret->pin, 1); /* XXX */ + spin_lock(&c->freelist_lock); + ob->on_partial_list = true; + ca->open_buckets_partial[ca->open_buckets_partial_nr++] = + ob - c->open_buckets; + spin_unlock(&c->freelist_lock); - BUG_ON(ret->new_ob); - BUG_ON(ret->nr_ptrs); - - c->open_buckets_nr_free--; - trace_open_bucket_alloc(c, cl); - } else { - trace_open_bucket_alloc_fail(c, cl); - - if (cl) { - closure_wait(&c->open_buckets_wait, cl); - ret = ERR_PTR(-EAGAIN); - } else - ret = ERR_PTR(-ENOSPC); - } - - spin_unlock(&c->open_buckets_lock); - - return ret; -} - -static int open_bucket_drop_ptrs(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs, - unsigned nr_ptrs_dislike, - struct closure *cl) -{ - struct open_bucket *src = wp->ob, *dst; - bool moved_ptr = false; - int i; - - dst = bch2_open_bucket_get(c, wp->type == BCH_DATA_BTREE - ? 
0 : BTREE_NODE_RESERVE, cl); - if (IS_ERR(dst)) - return PTR_ERR(dst); + closure_wake_up(&c->open_buckets_wait); + closure_wake_up(&c->freelist_wait); - down_read(&c->alloc_gc_lock); + wp->nr_ptrs--; + memmove(&wp->ptrs[i], + &wp->ptrs[i + 1], + (wp->nr_ptrs - i) * sizeof(wp->ptrs[0])); - if (dst < src) { - spin_lock(&dst->lock); - spin_lock_nested(&src->lock, 1); - } else { - spin_lock(&src->lock); - spin_lock_nested(&dst->lock, 1); - } - - for (i = src->nr_ptrs - 1; i >= 0; --i) { - if (!src->ptrs[i].sectors_free) { - /* - * Don't do anything: leave the ptr on the old - * open_bucket for gc to find - */ - } else if (nr_ptrs_dislike && - !test_bit(src->ptrs[i].ptr.dev, devs->d)) { - /* - * We don't want this pointer; bch2_open_bucket_put() - * will stick it on ca->open_buckets_partial to be - * reused - */ --nr_ptrs_dislike; - } else { - BUG_ON(dst->nr_ptrs >= ARRAY_SIZE(dst->ptrs)); - - dst->ptrs[dst->nr_ptrs++] = src->ptrs[i]; - - src->nr_ptrs--; - memmove(&src->ptrs[i], - &src->ptrs[i + 1], - (src->nr_ptrs - i) * sizeof(src->ptrs[0])); - - moved_ptr = true; } } - - if (moved_ptr) { - BUG_ON(src->new_ob); - - atomic_inc(&dst->pin); - src->new_ob = dst - c->open_buckets; - } - - spin_unlock(&dst->lock); - spin_unlock(&src->lock); - up_read(&c->alloc_gc_lock); - - bch2_open_bucket_put(c, src); - wp->ob = dst; - return 0; } -static void verify_not_stale(struct bch_fs *c, const struct open_bucket *ob) +static void verify_not_stale(struct bch_fs *c, const struct write_point *wp) { #ifdef CONFIG_BCACHEFS_DEBUG - const struct open_bucket_ptr *ptr; + struct open_bucket *ob; + unsigned i; - open_bucket_for_each_ptr(ob, ptr) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + writepoint_for_each_ptr(wp, ob, i) { + struct bch_dev *ca = c->devs[ob->ptr.dev]; - BUG_ON(ptr_stale(ca, &ptr->ptr)); + BUG_ON(ptr_stale(ca, &ob->ptr)); } #endif } -/* Sector allocator */ - static int open_bucket_add_buckets(struct bch_fs *c, struct bch_devs_mask *_devs, struct write_point *wp, @@ -1418,19 +1353,18 @@ static int open_bucket_add_buckets(struct bch_fs *c, struct closure *cl) { struct bch_devs_mask devs = c->rw_devs[wp->type]; - struct open_bucket_ptr *ptr; + struct open_bucket *ob; unsigned i; - if (wp->ob->nr_ptrs >= nr_replicas) + if (wp->nr_ptrs >= nr_replicas) return 0; /* Don't allocate from devices we already have pointers to: */ for (i = 0; i < devs_have->nr; i++) __clear_bit(devs_have->devs[i], devs.d); - open_bucket_for_each_ptr(wp->ob, ptr) - if (ptr->sectors_free) - __clear_bit(ptr->ptr.dev, devs.d); + writepoint_for_each_ptr(wp, ob, i) + __clear_bit(ob->ptr.dev, devs.d); if (_devs) bitmap_and(devs.d, devs.d, _devs->d, BCH_SB_MEMBERS_MAX); @@ -1522,104 +1456,72 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *c, unsigned flags, struct closure *cl) { - struct open_bucket *ob; struct write_point *wp; - struct open_bucket_ptr *ptr; - unsigned nr_ptrs_empty = 0, nr_ptrs_dislike = 0, nr_ptrs_have = 0; + struct open_bucket *ob; + unsigned i, nr_ptrs_dislike = 0, nr_ptrs_have = 0; int ret; BUG_ON(!nr_replicas || !nr_replicas_required); wp = writepoint_find(c, write_point.v); - ob = wp->ob; - - if (!ob) { - ob = bch2_open_bucket_get(c, wp->type == BCH_DATA_BTREE - ? 0 : BTREE_NODE_RESERVE, cl); - if (IS_ERR(ob)) { - ret = PTR_ERR(ob); - goto err; - } - - wp->ob = ob; - } /* does ob have ptrs we don't need? 
*/ - open_bucket_for_each_ptr(ob, ptr) { - if (!ptr->sectors_free) - nr_ptrs_empty++; - else if (bch2_dev_list_has_dev(*devs_have, ptr->ptr.dev)) + writepoint_for_each_ptr(wp, ob, i) + if (bch2_dev_list_has_dev(*devs_have, ob->ptr.dev)) nr_ptrs_have++; - else if (devs && !test_bit(ptr->ptr.dev, devs->d)) + else if (devs && !test_bit(ob->ptr.dev, devs->d)) nr_ptrs_dislike++; - } ret = open_bucket_add_buckets(c, devs, wp, devs_have, - nr_replicas + nr_ptrs_empty + nr_ptrs_have + nr_ptrs_dislike, + nr_replicas + nr_ptrs_have + nr_ptrs_dislike, reserve, cl); if (ret && ret != -EROFS) goto err; - if (ob->nr_ptrs < - nr_ptrs_empty + nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) { + if (wp->nr_ptrs < + nr_ptrs_have + nr_ptrs_dislike + nr_replicas_required) { ret = -EROFS; goto err; } - /* - * If ob->sectors_free == 0, one or more of the buckets ob points to is - * full. We can't drop pointers from an open bucket - garbage collection - * still needs to find them; instead, we must allocate a new open bucket - * and copy any pointers to non-full buckets into the new open bucket. - */ - if ((int) ob->nr_ptrs - nr_ptrs_empty - nr_ptrs_dislike < nr_replicas) - nr_ptrs_dislike = clamp_t(int, ob->nr_ptrs - nr_ptrs_empty - nr_replicas, + if ((int) wp->nr_ptrs - nr_ptrs_dislike < nr_replicas) + nr_ptrs_dislike = clamp_t(int, wp->nr_ptrs - nr_replicas, 0, nr_ptrs_dislike); - if (nr_ptrs_empty || nr_ptrs_dislike) { - /* Remove pointers we don't want to use: */ - ret = open_bucket_drop_ptrs(c, wp, devs, nr_ptrs_dislike, cl); - if (ret) - goto err; - - ob = wp->ob; - } + /* Remove pointers we don't want to use: */ + writepoint_drop_ptrs(c, wp, devs, nr_ptrs_dislike); /* * Move pointers to devices we already have to end of open bucket - * pointer list: + * pointer list - note that removing pointers we don't want to use might + * have changed nr_ptrs_have: */ if (nr_ptrs_have) { - /* - * Removing pointers we don't want to use might have changed - * nr_ptrs_have: - */ - nr_ptrs_have = 0; - ptr = ob->ptrs; - - while (ptr < ob->ptrs + ob->nr_ptrs - nr_ptrs_have) - if (bch2_dev_list_has_dev(*devs_have, ptr->ptr.dev)) { + i = nr_ptrs_have = 0; + while (i < wp->nr_ptrs - nr_ptrs_have) + if (bch2_dev_list_has_dev(*devs_have, wp->ptrs[i]->ptr.dev)) { nr_ptrs_have++; - swap(*ptr, ob->ptrs[ob->nr_ptrs - nr_ptrs_have]); + swap(wp->ptrs[i], wp->ptrs[wp->nr_ptrs - nr_ptrs_have]); } else { - ptr++; + i++; } - - wp->nr_ptrs_have = nr_ptrs_have; } - BUG_ON(ob->nr_ptrs - nr_ptrs_have < nr_replicas_required); + wp->nr_ptrs_can_use = + min_t(unsigned, nr_replicas, wp->nr_ptrs - nr_ptrs_have); + + BUG_ON(wp->nr_ptrs_can_use < nr_replicas_required || + wp->nr_ptrs_can_use > wp->nr_ptrs); wp->sectors_free = UINT_MAX; - for (ptr = ob->ptrs; - ptr < ob->ptrs + min_t(int, ob->nr_ptrs - nr_ptrs_have, nr_replicas); - ptr++) - wp->sectors_free = min(wp->sectors_free, ptr->sectors_free); + for (i = 0; i < wp->nr_ptrs_can_use; i++) + wp->sectors_free = min(wp->sectors_free, + wp->ptrs[i]->sectors_free); BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); - verify_not_stale(c, ob); + verify_not_stale(c, wp); return wp; err: @@ -1632,34 +1534,27 @@ err: * as allocated out of @ob */ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, - struct bkey_i_extent *e, unsigned nr_replicas, - unsigned sectors) + struct bkey_i_extent *e, unsigned sectors) { - struct open_bucket *ob = wp->ob; - struct bch_extent_ptr tmp; - struct open_bucket_ptr *ptr; - unsigned nr_ptrs = min_t(u8, ob->nr_ptrs - wp->nr_ptrs_have, 
nr_replicas); + unsigned i; - /* - * We're keeping any existing pointer k has, and appending new pointers: - * __bch2_write() will only write to the pointers we add here: - */ + BUG_ON(sectors > wp->sectors_free); + wp->sectors_free -= sectors; - for (ptr = ob->ptrs; ptr < ob->ptrs + nr_ptrs; ptr++) { - struct bch_dev *ca = c->devs[ptr->ptr.dev]; + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + struct bch_dev *ca = c->devs[ob->ptr.dev]; + struct bch_extent_ptr tmp = ob->ptr; - EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ptr->ptr.dev)); + EBUG_ON(bch2_extent_has_device(extent_i_to_s_c(e), ob->ptr.dev)); - tmp = ptr->ptr; tmp.cached = bkey_extent_is_cached(&e->k); - tmp.offset += ca->mi.bucket_size - ptr->sectors_free; + tmp.offset += ca->mi.bucket_size - ob->sectors_free; extent_ptr_append(e, tmp); - BUG_ON(sectors > ptr->sectors_free); - ptr->sectors_free -= sectors; + BUG_ON(sectors > ob->sectors_free); + ob->sectors_free -= sectors; } - - wp->sectors_free -= sectors; } /* @@ -1668,68 +1563,23 @@ void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, */ void bch2_alloc_sectors_done(struct bch_fs *c, struct write_point *wp) { - struct open_bucket_ptr *ptr; + int i; - atomic_inc(&wp->ob->pin); + for (i = wp->nr_ptrs - 1; i >= 0; --i) { + struct open_bucket *ob = wp->ptrs[i]; - open_bucket_for_each_ptr(wp->ob, ptr) - if (!ptr->sectors_free) { - open_bucket_drop_ptrs(c, wp, NULL, 0, NULL); - break; + if (!ob->sectors_free) { + wp->nr_ptrs--; + memmove(&wp->ptrs[i], + &wp->ptrs[i + 1], + (wp->nr_ptrs - i) * sizeof(wp->ptrs[0])); + bch2_open_bucket_put(c, ob); } + } mutex_unlock(&wp->lock); } -/* - * Allocates some space in the cache to write to, and k to point to the newly - * allocated space, and updates k->size and k->offset (to point to the - * end of the newly allocated space). - * - * May allocate fewer sectors than @sectors, k->size indicates how many - * sectors were actually allocated. - * - * Return codes: - * - -EAGAIN: closure was added to waitlist - * - -ENOSPC: out of space and no closure provided - * - * @c - filesystem. - * @wp - write point to use for allocating sectors. - * @k - key to return the allocated space information. 
- * @cl - closure to wait for a bucket - */ -struct open_bucket *bch2_alloc_sectors(struct bch_fs *c, - struct bch_devs_mask *devs, - struct write_point_specifier write_point, - struct bkey_i_extent *e, - unsigned nr_replicas, - unsigned nr_replicas_required, - enum alloc_reserve reserve, - unsigned flags, - struct closure *cl) -{ - struct write_point *wp; - struct open_bucket *ob; - struct bch_devs_list devs_have = bch2_extent_devs(extent_i_to_s_c(e)); - - wp = bch2_alloc_sectors_start(c, devs, write_point, &devs_have, - nr_replicas, nr_replicas_required, - reserve, flags, cl); - if (IS_ERR_OR_NULL(wp)) - return ERR_CAST(wp); - - ob = wp->ob; - - if (e->k.size > wp->sectors_free) - bch2_key_resize(&e->k, wp->sectors_free); - - bch2_alloc_sectors_append_ptrs(c, wp, e, nr_replicas, e->k.size); - - bch2_alloc_sectors_done(c, wp); - - return ob; -} - /* Startup/shutdown (ro/rw): */ void bch2_recalc_capacity(struct bch_fs *c) @@ -1835,42 +1685,15 @@ set_capacity: closure_wake_up(&c->freelist_wait); } -static bool open_bucket_has_device(struct open_bucket *ob, - struct bch_dev *ca) -{ - struct open_bucket_ptr *ptr; - bool ret = false; - - spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) - ret |= ptr->ptr.dev == ca->dev_idx; - spin_unlock(&ob->lock); - - return ret; -} - static void bch2_stop_write_point(struct bch_fs *c, struct bch_dev *ca, struct write_point *wp) { struct bch_devs_mask not_self; - struct closure cl; - closure_init_stack(&cl); bitmap_complement(not_self.d, ca->self.d, BCH_SB_MEMBERS_MAX); -retry: - mutex_lock(&wp->lock); - if (!wp->ob || !open_bucket_has_device(wp->ob, ca)) { - mutex_unlock(&wp->lock); - return; - } - if (open_bucket_drop_ptrs(c, wp, ¬_self, wp->ob->nr_ptrs, &cl)) { - mutex_unlock(&wp->lock); - closure_sync(&cl); - goto retry; - } - - BUG_ON(open_bucket_has_device(wp->ob, ca)); + mutex_lock(&wp->lock); + writepoint_drop_ptrs(c, wp, ¬_self, wp->nr_ptrs); mutex_unlock(&wp->lock); } @@ -1881,9 +1704,13 @@ static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); - ob++) - if (atomic_read(&ob->pin)) - ret |= open_bucket_has_device(ob, ca); + ob++) { + spin_lock(&ob->lock); + if (ob->valid && !ob->on_partial_list && + ob->ptr.dev == ca->dev_idx) + ret = true; + spin_unlock(&ob->lock); + } return ret; } @@ -1922,7 +1749,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) struct btree_alloc *a = &c->btree_reserve_cache[--c->btree_reserve_cache_nr]; - bch2_open_bucket_put(c, a->ob); + bch2_open_bucket_put_refs(c, &a->ob.nr, a->ob.refs); } mutex_unlock(&c->btree_reserve_cache_lock); @@ -2013,8 +1840,7 @@ void bch2_fs_allocator_init(struct bch_fs *c) unsigned i; mutex_init(&c->write_points_hash_lock); - init_rwsem(&c->alloc_gc_lock); - spin_lock_init(&c->open_buckets_lock); + spin_lock_init(&c->freelist_lock); bch2_prio_timer_init(c, READ); bch2_prio_timer_init(c, WRITE); diff --git a/fs/bcachefs/alloc.h b/fs/bcachefs/alloc.h index d535e0130995..b442a4ddce7a 100644 --- a/fs/bcachefs/alloc.h +++ b/fs/bcachefs/alloc.h @@ -24,10 +24,47 @@ void bch2_wp_rescale(struct bch_fs *, struct bch_dev *, int bch2_alloc_read(struct bch_fs *, struct list_head *); int bch2_alloc_replay_key(struct bch_fs *, struct bpos); +enum bucket_alloc_ret { + ALLOC_SUCCESS = 0, + OPEN_BUCKETS_EMPTY = -1, + FREELIST_EMPTY = -2, /* Allocator thread not keeping up */ + NO_DEVICES = -3, /* -EROFS */ +}; + int bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum 
alloc_reserve, bool, - struct open_bucket_ptr *); + struct closure *); + +void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + +static inline void bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) +{ + if (atomic_dec_and_test(&ob->pin)) + __bch2_open_bucket_put(c, ob); +} + +static inline void bch2_open_bucket_put_refs(struct bch_fs *c, u8 *nr, u8 *refs) +{ + unsigned i; -void bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); + for (i = 0; i < *nr; i++) + bch2_open_bucket_put(c, c->open_buckets + refs[i]); + + *nr = 0; +} + +static inline void bch2_open_bucket_get(struct bch_fs *c, + struct write_point *wp, + u8 *nr, u8 *refs) +{ + unsigned i; + + for (i = 0; i < wp->nr_ptrs_can_use; i++) { + struct open_bucket *ob = wp->ptrs[i]; + + atomic_inc(&ob->pin); + refs[(*nr)++] = ob - c->open_buckets; + } +} struct write_point *bch2_alloc_sectors_start(struct bch_fs *, struct bch_devs_mask *, @@ -39,18 +76,9 @@ struct write_point *bch2_alloc_sectors_start(struct bch_fs *, struct closure *); void bch2_alloc_sectors_append_ptrs(struct bch_fs *, struct write_point *, - struct bkey_i_extent *, unsigned, unsigned); + struct bkey_i_extent *, unsigned); void bch2_alloc_sectors_done(struct bch_fs *, struct write_point *); -struct open_bucket *bch2_alloc_sectors(struct bch_fs *, - struct bch_devs_mask *, - struct write_point_specifier, - struct bkey_i_extent *, - unsigned, unsigned, - enum alloc_reserve, - unsigned, - struct closure *); - static inline void bch2_wake_allocator(struct bch_dev *ca) { struct task_struct *p; @@ -61,10 +89,10 @@ static inline void bch2_wake_allocator(struct bch_dev *ca) rcu_read_unlock(); } -#define open_bucket_for_each_ptr(_ob, _ptr) \ - for ((_ptr) = (_ob)->ptrs; \ - (_ptr) < (_ob)->ptrs + (_ob)->nr_ptrs; \ - (_ptr)++) +#define writepoint_for_each_ptr(_wp, _ob, _i) \ + for ((_i) = 0; \ + (_i) < (_wp)->nr_ptrs && ((_ob) = (_wp)->ptrs[_i], true); \ + (_i)++) static inline struct write_point_specifier writepoint_hashed(unsigned long v) { diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index d55d959bd8ce..90123ff7b6d9 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -47,19 +47,14 @@ enum alloc_reserve { #define OPEN_BUCKETS_COUNT 256 #define WRITE_POINT_COUNT 32 -struct open_bucket_ptr { - struct bch_extent_ptr ptr; - unsigned sectors_free; -}; - struct open_bucket { spinlock_t lock; atomic_t pin; u8 freelist; - u8 new_ob; - u8 nr_ptrs; - - struct open_bucket_ptr ptrs[BCH_REPLICAS_MAX * 2]; + bool valid; + bool on_partial_list; + unsigned sectors_free; + struct bch_extent_ptr ptr; }; struct write_point { @@ -69,15 +64,16 @@ struct write_point { unsigned long write_point; enum bch_data_type type; + u8 nr_ptrs; /* * number of pointers in @ob we can't use, because we already had * pointers to those devices: */ - u8 nr_ptrs_have; + u8 nr_ptrs_can_use; /* calculated based on how many pointers we're actually going to use: */ unsigned sectors_free; - struct open_bucket *ob; + struct open_bucket *ptrs[BCH_REPLICAS_MAX * 2]; u64 next_alloc[BCH_SB_MEMBERS_MAX]; }; diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 289193acfe62..4d11651506ad 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -365,7 +365,7 @@ struct bch_dev { unsigned nr_invalidated; bool alloc_thread_started; - struct open_bucket_ptr open_buckets_partial[BCH_REPLICAS_MAX * WRITE_POINT_COUNT]; + u8 open_buckets_partial[OPEN_BUCKETS_COUNT]; unsigned open_buckets_partial_nr; size_t fifo_last_bucket; @@ -533,10 
+533,7 @@ struct bch_fs { * when allocating btree reserves fail halfway through) - instead, we * can stick them here: */ - struct btree_alloc { - struct open_bucket *ob; - BKEY_PADDED(k); - } btree_reserve_cache[BTREE_NODE_RESERVE * 2]; + struct btree_alloc btree_reserve_cache[BTREE_NODE_RESERVE * 2]; unsigned btree_reserve_cache_nr; struct mutex btree_reserve_cache_lock; @@ -549,7 +546,6 @@ struct bch_fs { struct workqueue_struct *copygc_wq; /* ALLOCATION */ - struct rw_semaphore alloc_gc_lock; struct delayed_work pd_controllers_update; unsigned pd_controllers_update_seconds; @@ -593,8 +589,8 @@ struct bch_fs { struct io_clock io_clock[2]; - /* SECTOR ALLOCATOR */ - spinlock_t open_buckets_lock; + /* ALLOCATOR */ + spinlock_t freelist_lock; u8 open_buckets_freelist; u8 open_buckets_nr_free; struct closure_waitlist open_buckets_wait; @@ -606,15 +602,6 @@ struct bch_fs { struct hlist_head write_points_hash[WRITE_POINT_COUNT]; struct mutex write_points_hash_lock; - /* - * This write point is used for migrating data off a device - * and can point to any other device. - * We can't use the normal write points because those will - * gang up n replicas, and for migration we want only one new - * replica. - */ - struct write_point migration_write_point; - /* GARBAGE COLLECTION */ struct task_struct *gc_thread; atomic_t kick_gc; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index b09019659ddd..b8a0d618fc0d 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -278,14 +278,12 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) { struct bch_dev *ca; struct open_bucket *ob; - const struct open_bucket_ptr *ptr; size_t i, j, iter; unsigned ci; - down_write(&c->alloc_gc_lock); + spin_lock(&c->freelist_lock); for_each_member_device(ca, c, ci) { - spin_lock(&ca->freelist_lock); fifo_for_each_entry(i, &ca->free_inc, iter) bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); @@ -293,27 +291,20 @@ static void bch2_mark_allocator_buckets(struct bch_fs *c) for (j = 0; j < RESERVE_NR; j++) fifo_for_each_entry(i, &ca->free[j], iter) bch2_mark_alloc_bucket(ca, &ca->buckets[i], true); - - for (ptr = ca->open_buckets_partial; - ptr < ca->open_buckets_partial + ca->open_buckets_partial_nr; - ptr++) - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); - - spin_unlock(&ca->freelist_lock); } + spin_unlock(&c->freelist_lock); + for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { spin_lock(&ob->lock); - open_bucket_for_each_ptr(ob, ptr) { - ca = c->devs[ptr->ptr.dev]; - bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ptr->ptr), true); + if (ob->valid) { + ca = c->devs[ob->ptr.dev]; + bch2_mark_alloc_bucket(ca, PTR_BUCKET(ca, &ob->ptr), true); } spin_unlock(&ob->lock); } - - up_write(&c->alloc_gc_lock); } static void mark_metadata_sectors(struct bch_dev *ca, u64 start, u64 end, diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 4ff28e744779..f1e06a378c9a 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -55,6 +55,16 @@ struct btree_write { struct closure_waitlist wait; }; +struct btree_ob_ref { + u8 nr; + u8 refs[BCH_REPLICAS_MAX]; +}; + +struct btree_alloc { + struct btree_ob_ref ob; + BKEY_PADDED(k); +}; + struct btree { /* Hottest entries first */ struct rhash_head hash; @@ -118,7 +128,7 @@ struct btree { */ struct btree_update *will_make_reachable; - struct open_bucket *ob; + struct btree_ob_ref ob; /* lru list */ struct list_head list; diff --git a/fs/bcachefs/btree_update_interior.c 
b/fs/bcachefs/btree_update_interior.c index 78a1f7798432..42076dce3757 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -229,7 +229,7 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, BUG_ON(btree_node_dirty(b)); BUG_ON(btree_node_need_write(b)); BUG_ON(b == btree_node_root(c, b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); BUG_ON(!list_empty(&b->write_blocked)); BUG_ON(b->will_make_reachable); @@ -254,17 +254,17 @@ static void __btree_node_free(struct bch_fs *c, struct btree *b, void bch2_btree_node_free_never_inserted(struct bch_fs *c, struct btree *b) { - struct open_bucket *ob = b->ob; + struct btree_ob_ref ob = b->ob; btree_update_drop_new_node(c, b); - b->ob = NULL; + b->ob.nr = 0; clear_btree_node_dirty(b); __btree_node_free(c, b, NULL); - bch2_open_bucket_put(c, ob); + bch2_open_bucket_put_refs(c, &ob.nr, ob.refs); } void bch2_btree_node_free_inmem(struct bch_fs *c, struct btree *b, @@ -296,8 +296,7 @@ static void bch2_btree_node_free_ondisk(struct bch_fs *c, void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *b) { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_open_bucket_put_refs(c, &b->ob.nr, b->ob.refs); } static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, @@ -305,9 +304,12 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, struct closure *cl, unsigned flags) { - BKEY_PADDED(k) tmp; - struct open_bucket *ob; + struct write_point *wp; struct btree *b; + BKEY_PADDED(k) tmp; + struct bkey_i_extent *e; + struct btree_ob_ref ob; + struct bch_devs_list devs_have = (struct bch_devs_list) { 0 }; unsigned nr_reserve; enum alloc_reserve alloc_reserve; @@ -335,32 +337,41 @@ static struct btree *__bch2_btree_node_alloc(struct bch_fs *c, mutex_unlock(&c->btree_reserve_cache_lock); retry: - /* alloc_sectors is weird, I suppose */ - bkey_extent_init(&tmp.k); - tmp.k.k.size = c->opts.btree_node_size, - - ob = bch2_alloc_sectors(c, NULL, - writepoint_ptr(&c->btree_write_point), - bkey_i_to_extent(&tmp.k), - res->nr_replicas, - c->opts.metadata_replicas_required, - alloc_reserve, 0, cl); - if (IS_ERR(ob)) - return ERR_CAST(ob); - - if (tmp.k.k.size < c->opts.btree_node_size) { - bch2_open_bucket_put(c, ob); + wp = bch2_alloc_sectors_start(c, NULL, + writepoint_ptr(&c->btree_write_point), + &devs_have, + res->nr_replicas, + c->opts.metadata_replicas_required, + alloc_reserve, 0, cl); + if (IS_ERR(wp)) + return ERR_CAST(wp); + + if (wp->sectors_free < c->opts.btree_node_size) { + struct open_bucket *ob; + unsigned i; + + writepoint_for_each_ptr(wp, ob, i) + if (ob->sectors_free < c->opts.btree_node_size) + ob->sectors_free = 0; + + bch2_alloc_sectors_done(c, wp); goto retry; } + + e = bkey_extent_init(&tmp.k); + bch2_alloc_sectors_append_ptrs(c, wp, e, c->opts.btree_node_size); + + ob.nr = 0; + bch2_open_bucket_get(c, wp, &ob.nr, ob.refs); + bch2_alloc_sectors_done(c, wp); mem_alloc: b = bch2_btree_node_mem_alloc(c); /* we hold cannibalize_lock: */ BUG_ON(IS_ERR(b)); - BUG_ON(b->ob); + BUG_ON(b->ob.nr); bkey_copy(&b->key, &tmp.k); - b->key.k.size = 0; b->ob = ob; return b; @@ -467,11 +478,10 @@ static void bch2_btree_reserve_put(struct bch_fs *c, struct btree_reserve *reser &c->btree_reserve_cache[c->btree_reserve_cache_nr++]; a->ob = b->ob; - b->ob = NULL; + b->ob.nr = 0; bkey_copy(&a->k, &b->key); } else { - bch2_open_bucket_put(c, b->ob); - b->ob = NULL; + bch2_btree_open_bucket_put(c, b); } __btree_node_free(c, b, NULL); diff --git a/fs/bcachefs/io.c b/fs/bcachefs/io.c index 
8d0b421495eb..f88650b6af8e 100644 --- a/fs/bcachefs/io.c +++ b/fs/bcachefs/io.c @@ -244,7 +244,6 @@ static void bch2_write_index(struct closure *cl) struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; - unsigned i; op->flags |= BCH_WRITE_LOOPED; @@ -262,13 +261,7 @@ static void bch2_write_index(struct closure *cl) } } - for (i = 0; i < ARRAY_SIZE(op->open_buckets); i++) - if (op->open_buckets[i]) { - bch2_open_bucket_put(c, - c->open_buckets + - op->open_buckets[i]); - op->open_buckets[i] = 0; - } + bch2_open_bucket_put_refs(c, &op->open_buckets_nr, op->open_buckets); if (!(op->flags & BCH_WRITE_DONE)) { continue_at(cl, __bch2_write, op->io_wq); @@ -367,8 +360,7 @@ static void init_append_extent(struct bch_write_op *op, bkey_extent_set_cached(&e->k, op->flags & BCH_WRITE_CACHED); bch2_extent_crc_append(e, crc); - bch2_alloc_sectors_append_ptrs(op->c, wp, e, op->nr_replicas, - crc.compressed_size); + bch2_alloc_sectors_append_ptrs(op->c, wp, e, crc.compressed_size); bkey_extent_set_cached(&e->k, (op->flags & BCH_WRITE_CACHED)); bch2_keylist_push(&op->insert_keys); @@ -746,13 +738,12 @@ static void __bch2_write(struct closure *cl) { struct bch_write_op *op = container_of(cl, struct bch_write_op, cl); struct bch_fs *c = op->c; - unsigned open_bucket_nr = 0; struct write_point *wp; - struct open_bucket *ob; int ret; do { - if (open_bucket_nr == ARRAY_SIZE(op->open_buckets)) { + if (op->open_buckets_nr + op->nr_replicas > + ARRAY_SIZE(op->open_buckets)) { continue_at(cl, bch2_write_index, index_update_wq(op)); return; } @@ -816,14 +807,13 @@ static void __bch2_write(struct closure *cl) continue; } - ob = wp->ob; - - BUG_ON(ob - c->open_buckets == 0 || - ob - c->open_buckets > U8_MAX); - op->open_buckets[open_bucket_nr++] = ob - c->open_buckets; - ret = bch2_write_extent(op, wp); + BUG_ON(op->open_buckets_nr + wp->nr_ptrs_can_use > + ARRAY_SIZE(op->open_buckets)); + bch2_open_bucket_get(c, wp, + &op->open_buckets_nr, + op->open_buckets); bch2_alloc_sectors_done(c, wp); if (ret < 0) @@ -881,7 +871,6 @@ void bch2_write(struct closure *cl) BUG_ON(!bkey_cmp(op->pos, POS_MAX)); BUG_ON(bio_sectors(&op->wbio.bio) > U16_MAX); - memset(op->open_buckets, 0, sizeof(op->open_buckets)); memset(&op->failed, 0, sizeof(op->failed)); bch2_keylist_init(&op->insert_keys, diff --git a/fs/bcachefs/io.h b/fs/bcachefs/io.h index 8e1a5d74c487..bd0d7c43c7a1 100644 --- a/fs/bcachefs/io.h +++ b/fs/bcachefs/io.h @@ -53,25 +53,26 @@ int bch2_write_index_default(struct bch_write_op *); static inline void __bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c) { - op->c = c; - op->io_wq = index_update_wq(op); - op->flags = 0; - op->written = 0; - op->error = 0; - op->csum_type = bch2_data_checksum_type(c); - op->compression_type = + op->c = c; + op->io_wq = index_update_wq(op); + op->flags = 0; + op->written = 0; + op->error = 0; + op->csum_type = bch2_data_checksum_type(c); + op->compression_type = bch2_compression_opt_to_type(c->opts.compression); - op->nr_replicas = 0; + op->nr_replicas = 0; op->nr_replicas_required = c->opts.data_replicas_required; - op->alloc_reserve = RESERVE_NONE; - op->devs_have = (struct bch_devs_list) { 0 }; - op->pos = POS_MAX; - op->version = ZERO_VERSION; - op->res = (struct disk_reservation) { 0 }; - op->devs = NULL; - op->write_point = (struct write_point_specifier) { 0 }; - op->journal_seq = 0; - op->index_update_fn = bch2_write_index_default; + op->alloc_reserve = RESERVE_NONE; + op->open_buckets_nr = 
0; + op->devs_have.nr = 0; + op->pos = POS_MAX; + op->version = ZERO_VERSION; + op->devs = NULL; + op->write_point = (struct write_point_specifier) { 0 }; + op->res = (struct disk_reservation) { 0 }; + op->journal_seq = 0; + op->index_update_fn = bch2_write_index_default; } static inline void bch2_write_op_init(struct bch_write_op *op, struct bch_fs *c, diff --git a/fs/bcachefs/io_types.h b/fs/bcachefs/io_types.h index d892172ac7ae..ed9a4bbe3929 100644 --- a/fs/bcachefs/io_types.h +++ b/fs/bcachefs/io_types.h @@ -100,6 +100,7 @@ struct bch_write_op { unsigned nr_replicas_required:4; unsigned alloc_reserve:4; + u8 open_buckets_nr; struct bch_devs_list devs_have; u16 target; u16 nonce; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 78a17330f453..5c9314e13bf8 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1570,27 +1570,24 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, memcpy(new_bucket_seq, ja->bucket_seq, ja->nr * sizeof(u64)); swap(new_buckets, ja->buckets); swap(new_bucket_seq, ja->bucket_seq); + spin_unlock(&j->lock); while (ja->nr < nr) { - struct open_bucket_ptr ptr; - size_t b; + struct open_bucket *ob; + size_t bucket; + int ob_idx; - /* must happen under journal lock, to avoid racing with gc: */ - if (bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &ptr) < 0) { - if (!closure_wait(&c->freelist_wait, &cl)) { - spin_unlock(&j->lock); + ob_idx = bch2_bucket_alloc(c, ca, RESERVE_ALLOC, false, &cl); + if (ob_idx < 0) { + if (!closure_wait(&c->freelist_wait, &cl)) closure_sync(&cl); - spin_lock(&j->lock); - } continue; } - b = sector_to_bucket(ca, ptr.ptr.offset); - - bch2_mark_metadata_bucket(ca, &ca->buckets[b], - BUCKET_JOURNAL, false); - bch2_mark_alloc_bucket(ca, &ca->buckets[b], false); + ob = c->open_buckets + ob_idx; + bucket = sector_to_bucket(ca, ob->ptr.offset); + spin_lock(&j->lock); memmove(ja->buckets + ja->last_idx + 1, ja->buckets + ja->last_idx, (ja->nr - ja->last_idx) * sizeof(u64)); @@ -1601,8 +1598,8 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, journal_buckets->buckets + ja->last_idx, (ja->nr - ja->last_idx) * sizeof(u64)); - ja->buckets[ja->last_idx] = b; - journal_buckets->buckets[ja->last_idx] = cpu_to_le64(b); + ja->buckets[ja->last_idx] = bucket; + journal_buckets->buckets[ja->last_idx] = cpu_to_le64(bucket); if (ja->last_idx < ja->nr) { if (ja->cur_idx >= ja->last_idx) @@ -1611,8 +1608,12 @@ static int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, } ja->nr++; + bch2_mark_metadata_bucket(ca, &ca->buckets[bucket], + BUCKET_JOURNAL, false); + spin_unlock(&j->lock); + + bch2_open_bucket_put(c, ob); } - spin_unlock(&j->lock); BUG_ON(bch2_sb_validate_journal(ca->disk_sb.sb, ca->mi)); diff --git a/include/trace/events/bcachefs.h b/include/trace/events/bcachefs.h index 6b897d88c186..94c902a7df8b 100644 --- a/include/trace/events/bcachefs.h +++ b/include/trace/events/bcachefs.h @@ -474,53 +474,9 @@ DEFINE_EVENT(bucket_alloc, bucket_alloc_fail, TP_ARGS(ca, reserve) ); -TRACE_EVENT(freelist_empty_fail, - TP_PROTO(struct bch_fs *c, enum alloc_reserve reserve, - struct closure *cl), - TP_ARGS(c, reserve, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(enum alloc_reserve, reserve ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->reserve = reserve; - __entry->cl = cl; - ), - - TP_printk("%pU reserve %d cl %p", __entry->uuid, __entry->reserve, - __entry->cl) -); - 
-DECLARE_EVENT_CLASS(open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl), - - TP_STRUCT__entry( - __array(char, uuid, 16 ) - __field(struct closure *, cl ) - ), - - TP_fast_assign( - memcpy(__entry->uuid, c->sb.user_uuid.b, 16); - __entry->cl = cl; - ), - - TP_printk("%pU cl %p", - __entry->uuid, __entry->cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) -); - -DEFINE_EVENT(open_bucket_alloc, open_bucket_alloc_fail, - TP_PROTO(struct bch_fs *c, struct closure *cl), - TP_ARGS(c, cl) +DEFINE_EVENT(bucket_alloc, open_bucket_alloc_fail, + TP_PROTO(struct bch_dev *ca, enum alloc_reserve reserve), + TP_ARGS(ca, reserve) ); /* Moving IO */ |
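As an aside for readers new to this allocator: throughout the patch, references to open buckets are passed around as u8 indexes into c->open_buckets (struct btree_ob_ref, bch_write_op::open_buckets, ca->open_buckets_partial[]), with a pin refcount per bucket and a freelist threaded through the free slots. The following is a small user-space model of that scheme; the names mirror the patch, but it is an illustrative toy under assumed simplifications, not the kernel code.

#include <assert.h>
#include <stdio.h>

#define OPEN_BUCKETS_COUNT 256

struct open_bucket {
	int		pin;		/* reference count; atomic_t in the kernel */
	unsigned char	freelist;	/* next free slot, 0 == end of list */
	int		valid;
};

static struct open_bucket open_buckets[OPEN_BUCKETS_COUNT];
static unsigned char open_buckets_freelist;	/* index of first free slot */
static unsigned open_buckets_nr_free;

static void open_buckets_init(void)
{
	/* Slot 0 stays unused so that index 0 can mean "no bucket". */
	for (unsigned i = OPEN_BUCKETS_COUNT - 1; i >= 1; i--) {
		open_buckets[i].freelist = open_buckets_freelist;
		open_buckets_freelist = (unsigned char) i;
		open_buckets_nr_free++;
	}
}

static unsigned char open_bucket_alloc(void)
{
	assert(open_buckets_nr_free);

	unsigned char idx = open_buckets_freelist;
	struct open_bucket *ob = &open_buckets[idx];

	open_buckets_freelist = ob->freelist;
	open_buckets_nr_free--;
	ob->pin = 1;
	ob->valid = 1;
	return idx;
}

static void open_bucket_put(unsigned char idx)
{
	struct open_bucket *ob = &open_buckets[idx];

	if (--ob->pin)			/* only the last reference frees the slot */
		return;

	ob->valid = 0;
	ob->freelist = open_buckets_freelist;
	open_buckets_freelist = idx;
	open_buckets_nr_free++;
}

int main(void)
{
	unsigned char refs[4];		/* cf. btree_ob_ref / op->open_buckets */
	unsigned char nr = 0;

	open_buckets_init();

	refs[nr++] = open_bucket_alloc();	/* one reference per replica */
	refs[nr++] = open_bucket_alloc();

	/* ... the index update making the allocation reachable goes here ... */

	while (nr)			/* cf. bch2_open_bucket_put_refs() */
		open_bucket_put(refs[--nr]);

	printf("free open buckets: %u\n", open_buckets_nr_free);
	return 0;
}

Keeping the references as byte-sized indexes is what lets those per-caller ref arrays stay tiny and fixed-size, which fits with OPEN_BUCKETS_COUNT being 256.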