-rw-r--r-- | fs/bcachefs/Makefile | 6
-rw-r--r-- | fs/bcachefs/alloc.c | 13
-rw-r--r-- | fs/bcachefs/alloc_types.h | 2
-rw-r--r-- | fs/bcachefs/bcache.h | 51
-rw-r--r-- | fs/bcachefs/btree_gc.c | 18
-rw-r--r-- | fs/bcachefs/buckets.h | 8
-rw-r--r-- | fs/bcachefs/buckets_types.h | 3
-rw-r--r-- | fs/bcachefs/extents.c | 8
-rw-r--r-- | fs/bcachefs/keylist.c | 190
-rw-r--r-- | fs/bcachefs/keylist.h | 44
-rw-r--r-- | fs/bcachefs/keylist_types.h | 45
-rw-r--r-- | fs/bcachefs/migrate.c | 166
-rw-r--r-- | fs/bcachefs/move.c | 581
-rw-r--r-- | fs/bcachefs/move.h | 149
-rw-r--r-- | fs/bcachefs/move_types.h | 65
-rw-r--r-- | fs/bcachefs/movinggc.c | 367
-rw-r--r-- | fs/bcachefs/movinggc.h | 9
-rw-r--r-- | fs/bcachefs/rebalance.c | 467
-rw-r--r-- | fs/bcachefs/rebalance.h | 7
-rw-r--r-- | fs/bcachefs/super.c | 80
-rw-r--r-- | fs/bcachefs/sysfs.c | 34
-rw-r--r-- | fs/bcachefs/tier.c | 466
-rw-r--r-- | fs/bcachefs/tier.h | 12
-rw-r--r-- | include/trace/events/bcache.h | 2
24 files changed, 633 insertions, 2160 deletions
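Note on the shape of the change: the moving_queue machinery (per-queue read/write counts, rb-tree ordering, stop/start state) is deleted, and the new move.c throttles in-flight work with nothing more than a counting semaphore sized to an 8 MiB page budget (see move_context_init(), bch_data_move() and bch_moving_io_destructor() in the diff below). The following is a minimal userspace sketch of that throttling pattern, not the kernel code itself: the POSIX semaphore, SKETCH_PAGE_SIZE, and the issue_move()/complete_move() helpers are stand-ins introduced here for illustration only.

/*
 * Sketch (userspace, illustrative): the page-budget throttle that replaces
 * the old moving_queue counters.  The patch sizes a semaphore to
 * (8 MiB / PAGE_SIZE) in move_context_init(), downs it once per page in
 * bch_data_move() before issuing the read, and ups it once per page when
 * the io is freed in bch_moving_io_destructor().
 */
#include <semaphore.h>
#include <stdio.h>

#define SKETCH_PAGE_SIZE	4096UL
#define BUDGET_PAGES		((8UL << 20) / SKETCH_PAGE_SIZE)

static sem_t nr_pages_limit;

/* analogue of bch_data_move(): take one unit per page before issuing */
static void issue_move(unsigned nr_pages)
{
	for (unsigned i = 0; i < nr_pages; i++)
		sem_wait(&nr_pages_limit);
	printf("issued move of %u pages\n", nr_pages);
}

/* analogue of bch_moving_io_destructor(): return the pages on completion */
static void complete_move(unsigned nr_pages)
{
	for (unsigned i = 0; i < nr_pages; i++)
		sem_post(&nr_pages_limit);
}

int main(void)
{
	sem_init(&nr_pages_limit, 0, BUDGET_PAGES);
	issue_move(16);
	complete_move(16);
	sem_destroy(&nr_pages_limit);
	return 0;
}

The effect is the same back-pressure the old max_count/max_read_count fields provided, but enforced per page rather than per io, and with no per-queue locking.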
diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index 390c167819c4..5a688e85732c 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -5,6 +5,6 @@ bcache-y := acl.o alloc.o bkey.o bkey_methods.o blockdev.o\ bset.o btree_cache.o btree_gc.o btree_io.o btree_iter.o btree_update.o\ buckets.o chardev.o checksum.o clock.o closure.o debug.o dirent.o\ error.o extents.o fs.o fs-gc.o fs-io.o inode.o io.o journal.o keybuf.o\ - keylist.o migrate.o move.o movinggc.o notify.o opts.o request.o\ - siphash.o six.o stats.o super.o sysfs.o tier.o trace.o util.o\ - writeback.o xattr.o + keylist.o migrate.o move.o notify.o opts.o rebalance.o request.o\ + siphash.o six.o stats.o super.o sysfs.o trace.o util.o writeback.o\ + xattr.o diff --git a/fs/bcachefs/alloc.c b/fs/bcachefs/alloc.c index 1d1d302c84a1..3485019c535a 100644 --- a/fs/bcachefs/alloc.c +++ b/fs/bcachefs/alloc.c @@ -148,10 +148,10 @@ static void pd_controllers_update(struct work_struct *work) if (fragmented < 0) fragmented = 0; - +#if 0 bch_pd_controller_update(&ca->moving_gc_pd, free, fragmented, -1); - +#endif if (i == 0) tier0_can_free += fragmented; @@ -165,11 +165,12 @@ static void pd_controllers_update(struct work_struct *work) u64 target = div_u64(tier_size[0] * c->tiering_percent, 100); tier0_can_free = max_t(s64, 0, tier_dirty[0] - target); - +#if 0 bch_pd_controller_update(&c->tiering_pd, target, tier_dirty[0], -1); +#endif } /* @@ -579,7 +580,6 @@ static void __bch_invalidate_one_bucket(struct cache *ca, struct bucket *g) g->read_prio = ca->set->prio_clock[READ].hand; g->write_prio = ca->set->prio_clock[WRITE].hand; - g->copygc_gen = 0; verify_not_on_freelist(ca, g - ca->buckets); } @@ -1643,11 +1643,10 @@ void bch_cache_allocator_stop(struct cache *ca) for (i = 0; i < ARRAY_SIZE(c->write_points); i++) bch_stop_write_point(ca, &c->write_points[i]); - for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) - bch_stop_write_point(ca, &ca->gc_buckets[i]); + for (i = 0; i < ARRAY_SIZE(c->rebalance); i++) + bch_stop_write_point(ca, &c->rebalance[i].wp); bch_stop_write_point(ca, &c->promote_write_point); - bch_stop_write_point(ca, &ca->tiering_write_point); bch_stop_write_point(ca, &c->migration_write_point); bch_stop_write_point(ca, &c->btree_write_point); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 065b9c02f185..1372fc26ccb1 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -38,7 +38,7 @@ enum alloc_reserve { RESERVE_PRIO, RESERVE_BTREE, RESERVE_METADATA_LAST = RESERVE_BTREE, - RESERVE_MOVINGGC, + RESERVE_MOVINGGC, /* hrm */ RESERVE_NONE, RESERVE_NR, diff --git a/fs/bcachefs/bcache.h b/fs/bcachefs/bcache.h index 7f856f7fd1aa..a6bbd38f4316 100644 --- a/fs/bcachefs/bcache.h +++ b/fs/bcachefs/bcache.h @@ -278,7 +278,6 @@ #include "journal_types.h" #include "keylist_types.h" #include "keybuf_types.h" -#include "move_types.h" #include "stats_types.h" #include "super_types.h" @@ -356,8 +355,6 @@ struct cache { struct cache_set *set; - struct cache_group self; - /* * Cached version of this device's member info from superblock * Committed by write_super() @@ -433,25 +430,6 @@ struct cache { struct mutex heap_lock; DECLARE_HEAP(struct bucket_heap_entry, heap); - /* Moving GC: */ - struct task_struct *moving_gc_read; - - struct moving_queue moving_gc_queue; - struct bch_pd_controller moving_gc_pd; - - /* Tiering: */ - struct moving_queue tiering_queue; - struct write_point tiering_write_point; - unsigned tiering_stripe_size; - - /* - * open buckets used in moving garbage collection - * 
NOTE: GC_GEN == 0 signifies no moving gc, so accessing the - * gc_buckets array is always GC_GEN-1. - */ -#define NUM_GC_GENS 8 - struct write_point gc_buckets[NUM_GC_GENS]; - struct journal_device journal; struct work_struct io_error_work; @@ -504,6 +482,26 @@ struct btree_debug { struct dentry *btree_format; }; +struct rebalance_bucket_entry { + size_t bucket; + u8 dev; + u8 gen; + unsigned sectors; +}; + +struct rebalance_thread { + unsigned tier; + unsigned initialized; + struct task_struct *p; + struct bch_pd_controller pd; + struct write_point wp; + + struct workqueue_struct *wq; + + struct mutex heap_lock; + DECLARE_HEAP(struct rebalance_bucket_entry, heap); +}; + struct cache_set { struct closure cl; @@ -702,10 +700,6 @@ struct cache_set { struct task_struct *gc_thread; atomic_t kick_gc; - /* This is a list of scan_keylists for btree GC to scan */ - struct list_head gc_scan_keylists; - struct mutex gc_scan_keylist_lock; - /* * Tracks GC's progress - everything in the range [ZERO_KEY..gc_cur_pos] * has been marked by GC. @@ -751,9 +745,8 @@ struct cache_set { /* FILESYSTEM */ atomic_long_t nr_inodes; - /* TIERING */ - struct task_struct *tiering_read; - struct bch_pd_controller tiering_pd; + /* REBALANCE */ + struct rebalance_thread rebalance[CACHE_TIERS]; /* NOTIFICATIONS */ struct mutex uevent_lock; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 965b4a58ba9a..65222eb0b0cb 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -301,23 +301,6 @@ static void bch_mark_pending_btree_node_frees(struct cache_set *c) mutex_unlock(&c->btree_interior_update_lock); } -static void bch_mark_scan_keylists(struct cache_set *c) -{ - struct scan_keylist *kl; - - mutex_lock(&c->gc_scan_keylist_lock); - - /* What the goddamn fuck? */ - list_for_each_entry(kl, &c->gc_scan_keylists, mark_list) { - if (kl->owner == NULL) - bch_keylist_recalc_oldest_gens(c, kl); - else - bch_queue_recalc_oldest_gens(c, kl->owner); - } - - mutex_unlock(&c->gc_scan_keylist_lock); -} - /** * bch_gc - recompute bucket marks and oldest_gen, rewrite btree nodes */ @@ -419,7 +402,6 @@ void bch_gc(struct cache_set *c) bch_mark_metadata(c); bch_mark_pending_btree_node_frees(c); bch_writeback_recalc_oldest_gens(c); - bch_mark_scan_keylists(c); for_each_cache(ca, c, i) atomic_long_set(&ca->saturated_count, 0); diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index e878ac09a0f2..be225cb850c7 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -33,6 +33,14 @@ static inline struct cache *PTR_CACHE(const struct cache_set *c, return rcu_dereference(c->cache[ptr->dev]); } +static inline unsigned PTR_TIER(const struct cache_member_rcu *mi, + const struct bch_extent_ptr *ptr) +{ + return ptr->dev < mi->nr_in_set + ? 
mi->m[ptr->dev].tier + : UINT_MAX; +} + static inline size_t PTR_BUCKET_NR(const struct cache *ca, const struct bch_extent_ptr *ptr) { diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index a1914404531e..78cdcafcf155 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -42,9 +42,6 @@ struct bucket { struct bucket_mark mark; /* Most out of date gen in the btree */ u8 oldest_gen; - - /* generation copygc is going to move this bucket into */ - u8 copygc_gen; }; struct bucket_stats_cache { diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index d041f0cfbdc7..6890bcc77dae 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -1649,14 +1649,6 @@ static void bch_extent_to_text(struct cache_set *c, char *buf, #undef p } -static unsigned PTR_TIER(struct cache_member_rcu *mi, - const struct bch_extent_ptr *ptr) -{ - return ptr->dev < mi->nr_in_set - ? mi->m[ptr->dev].tier - : UINT_MAX; -} - void bch_extent_entry_append(struct bkey_i_extent *e, union bch_extent_entry *entry) { diff --git a/fs/bcachefs/keylist.c b/fs/bcachefs/keylist.c index 638596300575..644734b1d4f2 100644 --- a/fs/bcachefs/keylist.c +++ b/fs/bcachefs/keylist.c @@ -117,193 +117,3 @@ void bch_keylist_add_in_order(struct keylist *l, struct bkey_i *insert) BUG_ON(l->top_p > l->end_keys_p); bkey_copy(where, insert); } - -/* Scan keylists simple utilities */ - -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size) - -{ - kl->c = c; - kl->owner = NULL; - - mutex_init(&kl->lock); - kl->max_size = max_size; - bch_keylist_init(&kl->list, NULL, 0); - - /* - * Order of initialization is tricky, and this makes sure that - * we have a valid cache set in case the order of - * initialization chages and breaks things. - */ - BUG_ON(c == NULL); - mutex_lock(&c->gc_scan_keylist_lock); - list_add_tail(&kl->mark_list, &c->gc_scan_keylists); - mutex_unlock(&c->gc_scan_keylist_lock); -} - -void bch_scan_keylist_destroy(struct scan_keylist *kl) -{ - if (kl->c) { - mutex_lock(&kl->c->gc_scan_keylist_lock); - list_del(&kl->mark_list); - mutex_unlock(&kl->c->gc_scan_keylist_lock); - } - - mutex_lock(&kl->lock); - bch_keylist_free(&kl->list); - mutex_unlock(&kl->lock); -} - -void bch_scan_keylist_reset(struct scan_keylist *kl) -{ - mutex_lock(&kl->lock); - kl->list.bot_p = kl->list.top_p = kl->list.start_keys_p; - mutex_unlock(&kl->lock); -} - -/* - * This should only be called from sysfs, and holding a lock that prevents - * re-entrancy. - */ -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size) -{ - mutex_lock(&kl->lock); - kl->max_size = max_size; /* May be smaller than current size */ - mutex_unlock(&kl->lock); -} - -/** - * bch_keylist_recalc_oldest_gens - update oldest_gen pointers from keylist keys - * - * This prevents us from wrapping around gens for a bucket only referenced from - * the tiering or moving GC keylists. We don't actually care that the data in - * those buckets is marked live, only that we don't wrap the gens. - * - * Note: This interlocks with insertions, but not all dequeues interlock. - * The particular case in which dequeues don't interlock is when a - * scan list used by the copy offload ioctls is used as a plain - * keylist for btree insertion. - * The btree insertion code doesn't go through - * bch_scan_keylist_dequeue below, and instead uses plain - * bch_keylist_dequeue. The other pointers (top, start, end) are - * unchanged in this case. 
- * A little care with the bottomp pointer suffices in this case. - * Of course, we may end up marking stuff that we don't need to mark, - * but was recently valid and we have likely just inserted in the tree - * anyway. - */ -void bch_keylist_recalc_oldest_gens(struct cache_set *c, - struct scan_keylist *kl) -{ - struct bkey_i *k; - - mutex_lock(&kl->lock); - - for_each_keylist_key(&kl->list, k) - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(k)); - - mutex_unlock(&kl->lock); -} - -int bch_scan_keylist_add(struct scan_keylist *kl, struct bkey_s_c k) -{ - int ret; - - mutex_lock(&kl->lock); - ret = bch_keylist_realloc_max(&kl->list, - k.k->u64s, - kl->max_size); - - if (!ret) { - bkey_reassemble(kl->list.top, k); - bch_keylist_enqueue(&kl->list); - atomic64_add(k.k->size, &kl->sectors); - } - mutex_unlock(&kl->lock); - - return ret; -} - -/* Actual scanning functionality of scan_keylists */ - -static void bch_refill_scan_keylist(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - struct bpos start = *last_scanned; - struct btree_iter iter; - struct bkey_s_c k; - unsigned nr_found = 0; - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, *last_scanned, k) { - if (bkey_cmp(k.k->p, end) >= 0) { - *last_scanned = k.k->p; - goto done; - } - - if (pred(kl, k)) { - if (bch_scan_keylist_add(kl, k)) - goto done; - - nr_found++; - } - - *last_scanned = k.k->p; - bch_btree_iter_cond_resched(&iter); - } - - /* If we end up here, it means: - * - the map_fn didn't fill up the keybuf - * - the map_fn didn't see the end key - * - there were no more keys to map over - * Therefore, we are at the end of the key space */ - *last_scanned = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_keyscan(nr_found, - start.inode, start.offset, - last_scanned->inode, - last_scanned->offset); -} - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *kl) -{ - if (bch_keylist_empty(&kl->list)) - return NULL; - - return bch_keylist_front(&kl->list); -} - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred) -{ - if (bch_keylist_empty(&kl->list)) { - if (bkey_cmp(*last_scanned, end) >= 0) - return NULL; - - bch_refill_scan_keylist(c, kl, last_scanned, end, pred); - } - - return bch_scan_keylist_next(kl); -} - -void bch_scan_keylist_dequeue(struct scan_keylist *kl) -{ - u64 sectors; - - mutex_lock(&kl->lock); - sectors = kl->list.bot->k.size; - bch_keylist_dequeue(&kl->list); - mutex_unlock(&kl->lock); - - BUG_ON(atomic64_sub_return(sectors, &kl->sectors) < 0); -} diff --git a/fs/bcachefs/keylist.h b/fs/bcachefs/keylist.h index 028552757527..8fc92986f22f 100644 --- a/fs/bcachefs/keylist.h +++ b/fs/bcachefs/keylist.h @@ -116,49 +116,5 @@ void bch_keylist_add_in_order(struct keylist *, struct bkey_i *); int bch_keylist_realloc(struct keylist *, unsigned); int bch_keylist_realloc_max(struct keylist *, unsigned, unsigned); -void bch_scan_keylist_init(struct scan_keylist *kl, - struct cache_set *c, - unsigned max_size); - -void bch_scan_keylist_reset(struct scan_keylist *kl); - -/* The keylist is dynamically adjusted. 
This just clamps the maxima */ - -static inline unsigned bch_scan_keylist_size(struct scan_keylist *kl) -{ - return kl->max_size; -} - -static inline u64 bch_scan_keylist_sectors(struct scan_keylist *kl) -{ - return atomic64_read(&kl->sectors); -} - -void bch_scan_keylist_resize(struct scan_keylist *kl, - unsigned max_size); - -void bch_scan_keylist_destroy(struct scan_keylist *kl); - -/* - * IMPORTANT: The caller of bch_scan_keylist_next or - * bch_scan_keylist_next_rescan needs to copy any - * non-null return value before calling either again! - * These functions return a pointer into the internal structure. - * Furthermore, they need to call bch_scan_keylist_advance after - * copying the structure. - */ - -struct bkey_i *bch_scan_keylist_next(struct scan_keylist *); - -struct bkey_i *bch_scan_keylist_next_rescan(struct cache_set *c, - struct scan_keylist *kl, - struct bpos *last_scanned, - struct bpos end, - scan_keylist_pred_fn *pred); - -int bch_scan_keylist_add(struct scan_keylist *, struct bkey_s_c); -void bch_scan_keylist_dequeue(struct scan_keylist *); - -void bch_keylist_recalc_oldest_gens(struct cache_set *, struct scan_keylist *); #endif /* _BCACHE_KEYLIST_H */ diff --git a/fs/bcachefs/keylist_types.h b/fs/bcachefs/keylist_types.h index 569cdc2480e2..156fbe0745fd 100644 --- a/fs/bcachefs/keylist_types.h +++ b/fs/bcachefs/keylist_types.h @@ -48,49 +48,4 @@ struct keylist { bool has_buf; }; -/* - * scan_keylists are conceptually similar to keybufs, but they don't - * have an internal RB tree. - * keybufs should be used when read or write operations need to - * examine keys in flight, as for writeback. - * But for moving operations (moving gc, tiering, moving data off - * devices), read and writes don't need to look at all, so we don't - * need the RB tree and use scan_keylists instead. - * - * Note that unlike keybufs, they don't contain a semaphore to limit - * bios. That must be done externally, if necessary. - */ - -#define DFLT_SCAN_KEYLIST_MAX_SIZE 512 - -struct scan_keylist { - struct list_head mark_list; /* For GC marking */ - - struct cache_set *c; /* For destroying */ - - /* - * Only one thread is allowed to mutate the keylist. Other threads can - * read it. The mutex has to be taken by the mutator thread when - * mutating the keylist, and by other threads when reading, but not by - * the mutator thread when reading. - */ - struct mutex lock; - /* - * Maximum size, in u64s. The keylist will not grow beyond this size. - */ - unsigned max_size; - /* - * Number of sectors in keys currently on the keylist. - */ - atomic64_t sectors; - /* - * The underlying keylist. 
- */ - struct keylist list; - - struct moving_queue *owner; -}; - -typedef bool (scan_keylist_pred_fn)(struct scan_keylist *, struct bkey_s_c); - #endif /* _BCACHE_KEYLIST_TYPES_H */ diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index c33606865eb2..aa9e0dd80227 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -12,39 +12,30 @@ #include "migrate.h" #include "move.h" -static bool migrate_data_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - - return bkey_extent_is_data(k.k) && - bch_extent_has_device(bkey_s_c_to_extent(k), - ca->sb.nr_this_dev); -} - static void bch_extent_drop_dev_ptrs(struct bkey_s_extent e, unsigned dev) { struct bch_extent_ptr *ptr; + unsigned dropped = 0; extent_for_each_ptr_backwards(e, ptr) - if (ptr->dev == dev) + if (ptr->dev == dev) { bch_extent_drop_ptr(e, ptr); + dropped++; + } + + BUG_ON(dropped > 1); } -static int issue_migration_move(struct cache *ca, - struct moving_context *ctxt, - struct bkey_s_c k, - u64 *seen_key_count) +static int migrate_extent(struct cache_set *c, struct cache *ca, + struct bkey_s_c k, struct move_context *m) { - struct moving_queue *q = &ca->moving_gc_queue; - struct cache_set *c = ca->set; struct moving_io *io; struct disk_reservation res; if (bch_disk_reservation_get(c, &res, k.k->size, 0)) return -ENOSPC; - io = moving_io_alloc(k); + io = bch_moving_io_alloc(k); if (!io) { bch_disk_reservation_put(c, &res); return -ENOMEM; @@ -60,33 +51,14 @@ static int issue_migration_move(struct cache *ca, 0); io->op.nr_replicas = 1; - io->op.io_wq = q->wq; - bch_extent_drop_dev_ptrs(bkey_i_to_s_extent(&io->op.insert_key), ca->sb.nr_this_dev); - bch_data_move(q, ctxt, io); - (*seen_key_count)++; - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_scan_keylist_dequeue(&q->keys); + bch_data_move(m, io); return 0; } -#define MIGRATION_DEBUG 0 - #define MAX_DATA_OFF_ITER 10 -#define PASS_LOW_LIMIT (MIGRATION_DEBUG ? 0 : 2) -#define MIGRATE_NR 64 -#define MIGRATE_READ_NR 32 -#define MIGRATE_WRITE_NR 32 /* * This moves only the data off, leaving the meta-data (if any) in place. @@ -104,37 +76,9 @@ static int issue_migration_move(struct cache *ca, int bch_move_data_off_device(struct cache *ca) { - int ret; - struct bkey_i *k; - unsigned pass; - u64 seen_key_count; - unsigned last_error_count; - unsigned last_error_flags; - struct moving_context context; struct cache_set *c = ca->set; - struct moving_queue *queue = &ca->moving_gc_queue; - - /* - * This reuses the moving gc queue as it is no longer in use - * by moving gc, which must have been stopped to call this. - */ - - BUG_ON(ca->moving_gc_read != NULL); - - /* - * This may actually need to start the work queue because the - * device may have always been read-only and never have had it - * started (moving gc usually starts it but not for RO - * devices). 
- */ - - bch_queue_start(queue); - - queue_io_resize(queue, MIGRATE_NR, MIGRATE_READ_NR, MIGRATE_WRITE_NR); - - BUG_ON(queue->wq == NULL); - bch_moving_context_init(&context, NULL, MOVING_PURPOSE_MIGRATION); - context.avoid = ca; + u64 seen_key_count = 1; + unsigned pass; /* * In theory, only one pass should be necessary as we've @@ -153,82 +97,44 @@ int bch_move_data_off_device(struct cache *ca) * but that can be viewed as a verification pass. */ - seen_key_count = 1; - last_error_count = 0; - last_error_flags = 0; - for (pass = 0; (seen_key_count != 0 && (pass < MAX_DATA_OFF_ITER)); pass++) { - bool again; - - seen_key_count = 0; - atomic_set(&context.error_count, 0); - atomic_set(&context.error_flags, 0); - context.last_scanned = POS_MIN; - -again: - again = false; - - while (1) { - if (bch_queue_full(queue)) { - if (queue->rotational) { - again = true; - break; - } else { - bch_moving_wait(&context); - continue; - } - } + struct btree_iter iter; + struct bkey_s_c k; + struct move_context m; - k = bch_scan_keylist_next_rescan(c, - &queue->keys, - &context.last_scanned, - POS_MAX, - migrate_data_pred); - if (k == NULL) - break; + move_context_init(&m); - if (issue_migration_move(ca, &context, bkey_i_to_s_c(k), - &seen_key_count)) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } - } - - bch_queue_run(queue, &context); - if (again) - goto again; + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) { + if (bkey_extent_is_data(k.k) && + bch_extent_has_device(bkey_s_c_to_extent(k), + ca->sb.nr_this_dev)) { + BKEY_PADDED(k) tmp; - if ((pass >= PASS_LOW_LIMIT) - && (seen_key_count != (MIGRATION_DEBUG ? ~0ULL : 0))) { - pr_notice("found %llu keys on pass %u.", - seen_key_count, pass); - } + bkey_reassemble(&tmp.k, k); + bch_btree_iter_unlock(&iter); - last_error_count = atomic_read(&context.error_count); - last_error_flags = atomic_read(&context.error_flags); + seen_key_count++; + migrate_extent(c, ca, + bkey_i_to_s_c(&tmp.k), + &m); + } - if (last_error_count != 0) { - pr_notice("pass %u: error count = %u, error flags = 0x%x", - pass, last_error_count, last_error_flags); + bch_btree_iter_cond_resched(&iter); } + bch_btree_iter_unlock(&iter); + + closure_sync(&m.cl); } - if (seen_key_count != 0 || last_error_count != 0) { + if (seen_key_count) { pr_err("Unable to migrate all data in %d iterations.", MAX_DATA_OFF_ITER); - ret = -EDEADLK; - } else if (MIGRATION_DEBUG) - pr_notice("Migrated all data in %d iterations", pass); + return -EDEADLK; + } - bch_queue_run(queue, &context); - return ret; + return 0; } /* diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index bbfcbdae2f37..8e8ae4acb74c 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -1,75 +1,28 @@ #include "bcache.h" -#include "btree_gc.h" -#include "buckets.h" +#include "extents.h" #include "io.h" #include "move.h" -#include "super.h" -#include "keylist.h" #include <trace/events/bcache.h> -static void moving_error(struct moving_context *ctxt, unsigned flag) +void bch_moving_io_free(struct moving_io *io) { - atomic_inc(&ctxt->error_count); - atomic_or(flag, &ctxt->error_flags); -} - -void bch_moving_context_init(struct moving_context *ctxt, - struct bch_ratelimit *rate, - enum moving_purpose purpose) -{ - memset(ctxt, 0, sizeof(*ctxt)); - ctxt->task = current; - ctxt->rate = rate; - ctxt->purpose = purpose; - closure_init_stack(&ctxt->cl); -} - -/* - * bch_moving_wait() -- wait for a 
bch_moving_notify() call - * - * To deal with lost wakeups, we make this return immediately if notify - * was already called. - */ -void bch_moving_wait(struct moving_context *ctxt) -{ - while (1) { - set_current_state(TASK_INTERRUPTIBLE); - if (atomic_xchg(&ctxt->pending, 0)) - break; - schedule(); - } - __set_current_state(TASK_RUNNING); -} - -static void bch_moving_notify(struct moving_context *ctxt) -{ - atomic_set(&ctxt->pending, 1); - wake_up_process(ctxt->task); -} - -static bool __bch_queue_reads_pending(struct moving_queue *q) -{ - return (q->read_count > 0 || !RB_EMPTY_ROOT(&q->tree)); + bch_bio_free_pages(&io->bio.bio.bio); + kfree(io); } -static bool bch_queue_reads_pending(struct moving_queue *q) +static void bch_moving_io_destructor(struct closure *cl) { - unsigned long flags; - bool pending; + struct moving_io *io = container_of(cl, struct moving_io, cl); + struct move_context *m = container_of(cl->parent, + struct move_context, cl); + unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS); - spin_lock_irqsave(&q->lock, flags); - pending = __bch_queue_reads_pending(q); - spin_unlock_irqrestore(&q->lock, flags); + while (nr_pages--) + up(&m->nr_pages_limit); - return pending; -} - -static void bch_queue_write(struct moving_queue *q) -{ - BUG_ON(q->wq == NULL); - queue_work(q->wq, &q->work); + bch_moving_io_free(io); } static void moving_init(struct moving_io *io, struct bio *bio) @@ -86,531 +39,83 @@ static void moving_init(struct moving_io *io, struct bio *bio) bch_bio_map(bio, NULL); } -struct moving_io *moving_io_alloc(struct bkey_s_c k) -{ - struct moving_io *io; - - io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) - * DIV_ROUND_UP(k.k->size, PAGE_SECTORS), - GFP_KERNEL); - if (!io) - return NULL; - - bkey_reassemble(&io->key, k); - - moving_init(io, &io->rbio.bio); - - if (bio_alloc_pages(&io->rbio.bio, GFP_KERNEL)) { - kfree(io); - return NULL; - } - - return io; -} - -void moving_io_free(struct moving_io *io) -{ - bch_bio_free_pages(&io->wbio.bio.bio); - kfree(io); -} - -static void moving_io_destructor(struct closure *cl) +static void write_moving(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; - unsigned long flags; - bool kick_writes = true; - - if (io->replace.failures) - trace_bcache_copy_collision(q, &io->key.k); - - spin_lock_irqsave(&q->lock, flags); - - BUG_ON(!q->count); - q->count--; - - if (io->read_issued) { - BUG_ON(!q->read_count); - q->read_count--; - } - - if (io->write_issued) { - BUG_ON(!q->write_count); - q->write_count--; - trace_bcache_move_write_done(q, &io->key.k); - } - - list_del_init(&io->list); - - if ((q->count == 0) && (q->stop_waitcl != NULL)) { - closure_put(q->stop_waitcl); - q->stop_waitcl = NULL; - } - - if (q->rotational && __bch_queue_reads_pending(q)) - kick_writes = false; - - if (list_empty(&q->pending)) - kick_writes = false; - - spin_unlock_irqrestore(&q->lock, flags); - - moving_io_free(io); - - if (kick_writes) - bch_queue_write(q); - - bch_moving_notify(ctxt); -} - -static void moving_io_after_write(struct closure *cl) -{ - struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_context *ctxt = io->context; if (io->op.error) - moving_error(ctxt, MOVING_FLAG_WRITE); - - moving_io_destructor(cl); -} - -static void write_moving(struct moving_io *io) -{ - bool stopped; - unsigned long flags; - struct bch_write_op *op = &io->op; - - spin_lock_irqsave(&io->q->lock, flags); 
- BUG_ON(io->q->count == 0); - stopped = io->q->stopped; - spin_unlock_irqrestore(&io->q->lock, flags); - - /* - * If the queue has been stopped, prevent the write from occurring. - * This stops all writes on a device going read-only as quickly - * as possible. - */ - - if (op->error || stopped) - closure_return_with_destructor(&io->cl, moving_io_destructor); - else { - moving_init(io, &io->wbio.bio.bio); - - op->bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); - - closure_call(&op->cl, bch_write, NULL, &io->cl); - closure_return_with_destructor(&io->cl, moving_io_after_write); - } -} - -static void bch_queue_write_work(struct work_struct *work) -{ - struct moving_queue *q = container_of(work, struct moving_queue, work); - struct moving_io *io; - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - - if (q->rotational && __bch_queue_reads_pending(q)) { - /* All reads should have finished before writes start */ - spin_unlock_irqrestore(&q->lock, flags); - return; - } - - while (!q->stopped && q->write_count < q->max_write_count) { - io = list_first_entry_or_null(&q->pending, - struct moving_io, list); - /* - * We only issue the writes in insertion order to preserve - * any linearity in the original key list/tree, so if we - * find an io whose read hasn't completed, we don't - * scan beyond it. Eventually that read will complete, - * at which point we may issue multiple writes (for it - * and any following entries whose reads had already - * completed and we had not examined here). - */ - if (!io || !io->read_completed) - break; - - BUG_ON(io->write_issued); - q->write_count++; - io->write_issued = 1; - list_del(&io->list); - list_add_tail(&io->list, &q->write_pending); - trace_bcache_move_write(q, &io->key.k); - spin_unlock_irqrestore(&q->lock, flags); - write_moving(io); - spin_lock_irqsave(&q->lock, flags); - } - - spin_unlock_irqrestore(&q->lock, flags); -} - -/* - * IMPORTANT: The caller of queue_init must have zero-filled it when it - * allocates it. 
- */ - -int bch_queue_init(struct moving_queue *q, - struct cache_set *c, - unsigned max_size, - unsigned max_count, - unsigned max_read_count, - unsigned max_write_count, - bool rotational, - const char *name) -{ - INIT_WORK(&q->work, bch_queue_write_work); - - q->keys.owner = q; - q->max_count = max_count; - q->max_read_count = max_read_count; - q->max_write_count = max_write_count; - q->rotational = rotational; + closure_return_with_destructor(&io->cl, bch_moving_io_destructor); - spin_lock_init(&q->lock); - INIT_LIST_HEAD(&q->pending); - INIT_LIST_HEAD(&q->write_pending); - q->tree = RB_ROOT; + moving_init(io); - q->wq = alloc_workqueue(name, - WQ_UNBOUND|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1); - if (!q->wq) - return -ENOMEM; + io->op.bio->bio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); - return 0; -} - -void bch_queue_start(struct moving_queue *q) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - q->stopped = false; - spin_unlock_irqrestore(&q->lock, flags); - - bch_scan_keylist_reset(&q->keys); -} - -void queue_io_resize(struct moving_queue *q, - unsigned max_io, - unsigned max_read, - unsigned max_write) -{ - unsigned long flags; - - spin_lock_irqsave(&q->lock, flags); - q->max_count = max_io; - q->max_read_count = max_read; - q->max_write_count = max_write; - spin_unlock_irqrestore(&q->lock, flags); -} - -void bch_queue_destroy(struct moving_queue *q) -{ - if (q->wq) - destroy_workqueue(q->wq); - q->wq = NULL; - - bch_scan_keylist_destroy(&q->keys); -} - -static void bch_queue_cancel_writes(struct moving_queue *q) -{ - struct moving_io *io; - unsigned long flags; - bool read_issued, read_completed; - - spin_lock_irqsave(&q->lock, flags); - - while (1) { - io = list_first_entry_or_null(&q->pending, - struct moving_io, - list); - if (!io) - break; - - BUG_ON(io->write_issued); - list_del_init(&io->list); - read_issued = io->read_issued; - read_completed = io->read_completed; - if (!read_issued && !read_completed && q->rotational) - rb_erase(&io->node, &q->tree); - spin_unlock_irqrestore(&q->lock, flags); - if (read_completed) - closure_return_with_destructor_noreturn(&io->cl, - moving_io_destructor); - else if (!read_issued) - moving_io_destructor(&io->cl); - spin_lock_irqsave(&q->lock, flags); - } - - spin_unlock_irqrestore(&q->lock, flags); -} - -void bch_queue_stop(struct moving_queue *q) -{ - unsigned long flags; - struct closure waitcl; - - closure_init_stack(&waitcl); - - spin_lock_irqsave(&q->lock, flags); - if (q->stopped) - BUG_ON(q->stop_waitcl != NULL); - else { - q->stopped = true; - if (q->count != 0) { - q->stop_waitcl = &waitcl; - closure_get(&waitcl); - } - } - spin_unlock_irqrestore(&q->lock, flags); - - bch_queue_cancel_writes(q); - - closure_sync(&waitcl); -} - -static void pending_recalc_oldest_gens(struct cache_set *c, struct list_head *l) -{ - struct moving_io *io; - - list_for_each_entry(io, l, list) { - /* - * This only marks the (replacement) key and not the - * insertion key in the bch_write_op, as the insertion - * key should be a subset of the replacement key except - * for any new pointers added by the write, and those - * don't need to be marked because they are pointing - * to open buckets until the write completes - */ - bch_btree_key_recalc_oldest_gen(c, bkey_i_to_s_c(&io->key)); - } -} - -void bch_queue_recalc_oldest_gens(struct cache_set *c, struct moving_queue *q) -{ - unsigned long flags; - - /* 1st, mark the keylist keys */ - bch_keylist_recalc_oldest_gens(c, &q->keys); - - /* 2nd, mark the keys in the I/Os */ - 
spin_lock_irqsave(&q->lock, flags); - - pending_recalc_oldest_gens(c, &q->pending); - pending_recalc_oldest_gens(c, &q->write_pending); - - spin_unlock_irqrestore(&q->lock, flags); + closure_call(&io->op.cl, bch_write, NULL, &io->cl); + closure_return_with_destructor(&io->cl, bch_moving_io_destructor); } static void read_moving_endio(struct bio *bio) { struct closure *cl = bio->bi_private; struct moving_io *io = container_of(cl, struct moving_io, cl); - struct moving_queue *q = io->q; - struct moving_context *ctxt = io->context; - bool stopped; - unsigned long flags; - - if (bio->bi_error) { + if (bio->bi_error) io->op.error = bio->bi_error; - moving_error(io->context, MOVING_FLAG_READ); - } - - bio_put(bio); - - spin_lock_irqsave(&q->lock, flags); - - trace_bcache_move_read_done(q, &io->key.k); - - BUG_ON(!io->read_issued); - BUG_ON(io->read_completed); - io->read_issued = 0; - io->read_completed = 1; - BUG_ON(!q->read_count); - q->read_count--; - stopped = q->stopped; - if (stopped) - list_del_init(&io->list); - spin_unlock_irqrestore(&q->lock, flags); - if (stopped) - closure_return_with_destructor(&io->cl, - moving_io_destructor); - else if (!q->rotational) - bch_queue_write(q); - - bch_moving_notify(ctxt); + closure_put(cl); } static void __bch_data_move(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct extent_pick_ptr pick; - u64 size = io->key.k.size; - bch_extent_pick_ptr_avoiding(io->op.c, bkey_i_to_s_c(&io->key), - io->context->avoid, &pick); + bch_extent_pick_ptr(io->op.c, + bkey_i_to_s_c(&io->key), + &pick); if (IS_ERR_OR_NULL(pick.ca)) - closure_return_with_destructor(cl, moving_io_destructor); - - io->context->keys_moved++; - io->context->sectors_moved += size; - if (io->context->rate) - bch_ratelimit_increment(io->context->rate, size); + closure_return_with_destructor(cl, bch_moving_io_destructor); io->rbio.bio.bi_rw = READ; io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&io->key.k); io->rbio.bio.bi_end_io = read_moving_endio; + closure_get(cl); bch_read_extent(io->op.c, &io->rbio, bkey_i_to_s_c(&io->key), &pick, BCH_READ_IS_LAST); -} - -/* - * bch_queue_full() - return if more reads can be queued with bch_data_move(). - * - * In rotational mode, always returns false if no reads are in flight (see - * how max_count is initialized in bch_queue_init()). - */ -bool bch_queue_full(struct moving_queue *q) -{ - unsigned long flags; - bool full; - spin_lock_irqsave(&q->lock, flags); - BUG_ON(q->count > q->max_count); - BUG_ON(q->read_count > q->max_read_count); - full = (q->count == q->max_count || - q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); - - return full; -} - -static int moving_io_cmp(struct moving_io *io1, struct moving_io *io2) -{ - if (io1->sort_key < io2->sort_key) - return -1; - else if (io1->sort_key > io2->sort_key) - return 1; - else { - /* We don't want duplicate keys. 
Eventually, we will have - * support for GC with duplicate pointers -- for now, - * just sort them randomly instead */ - if (io1 < io2) - return -1; - else if (io1 > io2) - return 1; - BUG(); - } + continue_at(cl, write_moving, io->op.io_wq); /* XXX different wq */ } -void bch_data_move(struct moving_queue *q, - struct moving_context *ctxt, - struct moving_io *io) +void bch_data_move(struct move_context *m, struct moving_io *io) { - unsigned long flags; - bool stopped = false; - - BUG_ON(q->wq == NULL); - io->q = q; - io->context = ctxt; - - spin_lock_irqsave(&q->lock, flags); - if (q->stopped) { - stopped = true; - goto out; - } - - q->count++; - list_add_tail(&io->list, &q->pending); - trace_bcache_move_read(q, &io->key.k); - - if (q->rotational) - BUG_ON(RB_INSERT(&q->tree, io, node, moving_io_cmp)); - else { - BUG_ON(io->read_issued); - io->read_issued = 1; - q->read_count++; - } + unsigned nr_pages = DIV_ROUND_UP(io->key.k.size, PAGE_SECTORS); -out: - spin_unlock_irqrestore(&q->lock, flags); + while (nr_pages--) + down(&m->nr_pages_limit); - if (stopped) - moving_io_free(io); - else if (!q->rotational) - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); + closure_call(&io->cl, __bch_data_move, NULL, &m->cl); } -/* Rotational device queues */ - -static bool bch_queue_read(struct moving_queue *q, - struct moving_context *ctxt) +struct moving_io *bch_moving_io_alloc(struct bkey_s_c k) { - unsigned long flags; - struct rb_node *node; struct moving_io *io; - bool stopped; - - BUG_ON(!q->rotational); - - spin_lock_irqsave(&q->lock, flags); - node = rb_first(&q->tree); - if (!node) { - spin_unlock_irqrestore(&q->lock, flags); - return false; - } - - io = rb_entry(node, struct moving_io, node); - rb_erase(node, &q->tree); - io->read_issued = 1; - q->read_count++; - stopped = q->stopped; - spin_unlock_irqrestore(&q->lock, flags); - if (stopped) { - moving_io_destructor(&io->cl); - return false; - } else { - closure_call(&io->cl, __bch_data_move, NULL, &ctxt->cl); - return true; - } -} - -void bch_queue_run(struct moving_queue *q, struct moving_context *ctxt) -{ - unsigned long flags; - bool full; - - if (!q->rotational) - goto sync; + io = kzalloc(sizeof(struct moving_io) + sizeof(struct bio_vec) + * DIV_ROUND_UP(k.k->size, PAGE_SECTORS), + GFP_KERNEL); + if (!io) + return NULL; - while (!bch_moving_context_wait(ctxt)) { - spin_lock_irqsave(&q->lock, flags); - full = (q->read_count == q->max_read_count); - spin_unlock_irqrestore(&q->lock, flags); + bkey_reassemble(&io->key, k); - if (full) { - bch_moving_wait(ctxt); - continue; - } + moving_init(io); - if (!bch_queue_read(q, ctxt)) - break; + if (bio_alloc_pages(&io->bio.bio.bio, GFP_KERNEL)) { + kfree(io); + return NULL; } - while (bch_queue_reads_pending(q)) - bch_moving_wait(ctxt); - - bch_queue_write(q); - -sync: - closure_sync(&ctxt->cl); + return io; } diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 2f9998e66e7f..4c5433e36abf 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -3,95 +3,25 @@ #include "buckets.h" #include "io_types.h" +#include <linux/semaphore.h> -enum moving_purpose { - MOVING_PURPOSE_UNKNOWN, /* Un-init */ - MOVING_PURPOSE_MIGRATION, - MOVING_PURPOSE_TIERING, - MOVING_PURPOSE_COPY_GC, -}; - -enum moving_flag_bitnos { - MOVING_FLAG_BITNO_READ = 0, - MOVING_FLAG_BITNO_WRITE, -}; - -#define MOVING_FLAG_READ (1U << MOVING_FLAG_BITNO_READ) -#define MOVING_FLAG_WRITE (1U << MOVING_FLAG_BITNO_WRITE) - -struct moving_context { - /* Closure for waiting on all reads and writes to complete */ +struct 
move_context { struct closure cl; - - /* Number and types of errors reported */ - atomic_t error_count; - atomic_t error_flags; - - /* If != 0, @task is waiting for a read or write to complete */ - atomic_t pending; - struct task_struct *task; - - /* Key and sector moves issued, updated from submission context */ - u64 keys_moved; - u64 sectors_moved; - - /* Last key scanned */ - struct bpos last_scanned; - - /* Rate-limiter counting submitted reads */ - struct bch_ratelimit *rate; - - /* Try to avoid reading the following device */ - struct cache *avoid; - - /* Debugging... */ - enum moving_purpose purpose; + struct semaphore nr_pages_limit; }; -void bch_moving_context_init(struct moving_context *, struct bch_ratelimit *, - enum moving_purpose); - -static inline int bch_moving_context_wait(struct moving_context *ctxt) +static inline void move_context_init(struct move_context *m) { - if (ctxt->rate == NULL) - return 0; - - return bch_ratelimit_wait_freezable_stoppable(ctxt->rate, &ctxt->cl); + closure_init_stack(&m->cl); + sema_init(&m->nr_pages_limit, (8 << 20) / PAGE_SIZE); } -void bch_moving_wait(struct moving_context *); - struct moving_io { - struct list_head list; - struct rb_node node; struct closure cl; - struct moving_queue *q; + struct bch_write_op op; struct bch_replace_info replace; - struct moving_context *context; BKEY_PADDED(key); - /* Sort key for moving_queue->tree */ - u64 sort_key; - /* Protected by q->lock */ - - /* - * 1) !read_issued && !read_completed - * - Closure is not running yet, starts when read_issued is set - * - IO is in q->tree (if q->rotational) and q->pending - * 2) !write_issued && !write_completed: - * - IO is in q->pending - * 3) write_issued: - * - IO is in q->write_pending - * 4) write_completed: - * - Closure is about to return and the IO is about to be freed - * - * If read_issued, we hold a reference on q->read_count - * If write_issued, we hold a reference on q->write_count - * Until IO is freed, we hold a reference on q->count - */ - unsigned read_issued:1; - unsigned read_completed:1; - unsigned write_issued:1; struct bch_read_bio rbio; struct bch_write_bio wbio; @@ -99,67 +29,8 @@ struct moving_io { struct bio_vec bi_inline_vecs[0]; }; -struct moving_io *moving_io_alloc(struct bkey_s_c); -void moving_io_free(struct moving_io *); - -typedef struct moving_io *(moving_queue_fn)(struct moving_queue *, - struct moving_context *); - -int bch_queue_init(struct moving_queue *, - struct cache_set *, - unsigned max_keys, - unsigned max_ios, - unsigned max_reads, - unsigned max_writes, - bool rotational, - const char *); -void bch_queue_start(struct moving_queue *); -bool bch_queue_full(struct moving_queue *); -void bch_data_move(struct moving_queue *, - struct moving_context *, - struct moving_io *); -void queue_io_resize(struct moving_queue *, - unsigned, - unsigned, - unsigned); -void bch_queue_destroy(struct moving_queue *); -void bch_queue_stop(struct moving_queue *); - -void bch_queue_recalc_oldest_gens(struct cache_set *, struct moving_queue *); - -void bch_queue_run(struct moving_queue *, struct moving_context *); - -#define sysfs_queue_attribute(name) \ - rw_attribute(name##_max_count); \ - rw_attribute(name##_max_read_count); \ - rw_attribute(name##_max_write_count); \ - rw_attribute(name##_max_keys) - -#define sysfs_queue_files(name) \ - &sysfs_##name##_max_count, \ - &sysfs_##name##_max_read_count, \ - &sysfs_##name##_max_write_count, \ - &sysfs_##name##_max_keys - -#define sysfs_queue_show(name, var) \ -do { \ - 
sysfs_hprint(name##_max_count, (var)->max_count); \ - sysfs_print(name##_max_read_count, (var)->max_read_count); \ - sysfs_print(name##_max_write_count, (var)->max_write_count);\ - sysfs_print(name##_max_keys, bch_scan_keylist_size(&(var)->keys));\ -} while (0) - -#define sysfs_queue_store(name, var) \ -do { \ - sysfs_strtoul(name##_max_count, (var)->max_count); \ - sysfs_strtoul(name##_max_read_count, (var)->max_read_count); \ - sysfs_strtoul(name##_max_write_count, (var)->max_write_count); \ - if (attr == &sysfs_##name##_max_keys) { \ - int v = strtoi_h_or_return(buf); \ - \ - v = clamp(v, 2, KEYLIST_MAX); \ - bch_scan_keylist_resize(&(var)->keys, v); \ - } \ -} while (0) +void bch_moving_io_free(struct moving_io *); +struct moving_io *bch_moving_io_alloc(struct bkey_s_c); +void bch_data_move(struct move_context *, struct moving_io *); #endif /* _BCACHE_MOVE_H */ diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h deleted file mode 100644 index d5e1a4a968fa..000000000000 --- a/fs/bcachefs/move_types.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef _BCACHE_MOVE_TYPES_H -#define _BCACHE_MOVE_TYPES_H - -/* - * We rely on moving_queue being kzalloc'd so that the initial value of - * the flags is 0. - */ - -struct moving_queue { - struct work_struct work; - struct scan_keylist keys; - struct workqueue_struct *wq; - - /* Configuration */ - unsigned max_count; - unsigned max_read_count; - unsigned max_write_count; - - /* - * If true, reads are coming from rotational media. All reads - * are queued up on @tree and sorted by physical location prior - * to being submitted. - */ - bool rotational; - - /* This can be examined without locking */ - bool stopped; - - /* Protects everything below */ - spinlock_t lock; - - struct closure *stop_waitcl; - - /* - * Tree of struct moving_io, sorted by moving_io->sort_key. - * Contains reads which have not yet been issued; when a read is - * issued, it is removed from the tree. - * - * Only used if @rotational is set. - */ - struct rb_root tree; - - /* - * List of struct moving_io, sorted by logical offset. - * Contains writes which have not yet been issued; when a write is - * issued, it is removed from the list. - * - * Writes are issued in logical offset order, and only when all - * prior writes have been issued. - */ - struct list_head pending; - - /* - * List of struct moving_io, sorted by logical offset. - * - * Contains writes which are in-flight. - */ - struct list_head write_pending; - - unsigned count; - unsigned read_count; - unsigned write_count; -}; - -#endif /* _BCACHE_MOVE_TYPES_H */ diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c deleted file mode 100644 index 0c77ea6c808c..000000000000 --- a/fs/bcachefs/movinggc.c +++ /dev/null @@ -1,367 +0,0 @@ -/* - * Moving/copying garbage collector - * - * Copyright 2012 Google, Inc. 
- */ - -#include "bcache.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "movinggc.h" - -#include <trace/events/bcache.h> -#include <linux/freezer.h> -#include <linux/kthread.h> - -/* Moving GC - IO loop */ - -static bool moving_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - moving_gc_queue.keys); - struct cache_set *c = ca->set; - const struct bch_extent_ptr *ptr; - bool ret = false; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - - rcu_read_lock(); - extent_for_each_ptr(e, ptr) - if (PTR_CACHE(c, ptr) == ca && - PTR_BUCKET(ca, ptr)->copygc_gen) - ret = true; - rcu_read_unlock(); - } - - return ret; -} - -static int issue_moving_gc_move(struct moving_queue *q, - struct moving_context *ctxt, - struct bkey_i *k) -{ - struct cache *ca = container_of(q, struct cache, moving_gc_queue); - struct cache_set *c = ca->set; - struct bkey_s_extent e; - struct bch_extent_ptr *ptr; - struct moving_io *io; - unsigned gen; - - io = moving_io_alloc(bkey_i_to_s_c(k)); - if (!io) { - trace_bcache_moving_gc_alloc_fail(c, k->k.size); - return -ENOMEM; - } - - bch_replace_init(&io->replace, bkey_i_to_s_c(k)); - - bch_write_op_init(&io->op, c, &io->wbio, - (struct disk_reservation) { 0 }, - NULL, bkey_i_to_s_c(k), - &io->replace.hook, NULL, - bkey_extent_is_cached(&k->k) - ? BCH_WRITE_CACHED : 0); - io->op.nr_replicas = 1; - - e = bkey_i_to_s_extent(&io->op.insert_key); - - extent_for_each_ptr(e, ptr) - if ((ca->sb.nr_this_dev == ptr->dev) && - (gen = PTR_BUCKET(ca, ptr)->copygc_gen)) { - gen--; - BUG_ON(gen > ARRAY_SIZE(ca->gc_buckets)); - io->op.wp = &ca->gc_buckets[gen]; - io->sort_key = ptr->offset; - bch_extent_drop_ptr(e, ptr); - goto found; - } - - /* We raced - bucket's been reused */ - moving_io_free(io); - goto out; -found: - trace_bcache_gc_copy(&k->k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. 
- */ - bch_data_move(q, ctxt, io); -out: - bch_scan_keylist_dequeue(&q->keys); - return 0; -} - -static void read_moving(struct cache *ca, struct moving_context *ctxt) -{ - struct bkey_i *k; - bool again; - - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - do { - again = false; - - while (!bch_moving_context_wait(ctxt)) { - if (bch_queue_full(&ca->moving_gc_queue)) { - if (ca->moving_gc_queue.rotational) { - again = true; - break; - } else { - bch_moving_wait(ctxt); - continue; - } - } - - k = bch_scan_keylist_next_rescan( - ca->set, - &ca->moving_gc_queue.keys, - &ctxt->last_scanned, - POS_MAX, - moving_pred); - - if (k == NULL) - break; - - if (issue_moving_gc_move(&ca->moving_gc_queue, - ctxt, k)) { - /* - * Memory allocation failed; we will wait for - * all queued moves to finish and continue - * scanning starting from the same key - */ - again = true; - break; - } - } - - bch_queue_run(&ca->moving_gc_queue, ctxt); - } while (!kthread_should_stop() && again); -} - -static void bch_moving_gc(struct cache *ca) -{ - struct cache_set *c = ca->set; - struct bucket *g; - - u64 sectors_to_move, sectors_gen, gen_current, sectors_total; - size_t buckets_to_move, buckets_unused = 0; - struct bucket_heap_entry e; - unsigned sectors_used, i; - int reserve_sectors; - - struct moving_context ctxt; - - bch_moving_context_init(&ctxt, &ca->moving_gc_pd.rate, - MOVING_PURPOSE_COPY_GC); - - /* - * We won't fill up the moving GC reserve completely if the data - * being copied is from different generations. In the worst case, - * there will be NUM_GC_GENS buckets of internal fragmentation - */ - - spin_lock(&ca->freelist_lock); - reserve_sectors = ca->mi.bucket_size * - (fifo_used(&ca->free[RESERVE_MOVINGGC]) - NUM_GC_GENS); - spin_unlock(&ca->freelist_lock); - - if (reserve_sectors < (int) c->sb.block_size) { - trace_bcache_moving_gc_reserve_empty(ca); - return; - } - - trace_bcache_moving_gc_start(ca); - - /* - * Find buckets with lowest sector counts, skipping completely - * empty buckets, by building a maxheap sorted by sector count, - * and repeatedly replacing the maximum element until all - * buckets have been visited. - */ - - mutex_lock(&ca->heap_lock); - ca->heap.used = 0; - for_each_bucket(g, ca) { - g->copygc_gen = 0; - - if (bucket_unused(g)) { - buckets_unused++; - continue; - } - - if (g->mark.owned_by_allocator || - g->mark.is_metadata) - continue; - - sectors_used = bucket_sectors_used(g); - - if (sectors_used >= ca->mi.bucket_size) - continue; - - bucket_heap_push(ca, g, sectors_used); - } - - sectors_to_move = 0; - for (i = 0; i < ca->heap.used; i++) - sectors_to_move += ca->heap.data[i].val; - - /* XXX: calculate this threshold rigorously */ - - if (ca->heap.used < ca->free_inc.size / 2 && - sectors_to_move < reserve_sectors) { - mutex_unlock(&ca->heap_lock); - trace_bcache_moving_gc_no_work(ca); - return; - } - - while (sectors_to_move > reserve_sectors) { - BUG_ON(!heap_pop(&ca->heap, e, bucket_min_cmp)); - sectors_to_move -= e.val; - } - - buckets_to_move = ca->heap.used; - - /* - * resort by write_prio to group into generations, attempts to - * keep hot and cold data in the same locality. 
- */ - - mutex_lock(&ca->set->bucket_lock); - for (i = 0; i < ca->heap.used; i++) { - struct bucket_heap_entry *e = &ca->heap.data[i]; - - e->val = (c->prio_clock[WRITE].hand - e->g->write_prio); - } - - heap_resort(&ca->heap, bucket_max_cmp); - - sectors_gen = sectors_to_move / NUM_GC_GENS; - gen_current = 1; - sectors_total = 0; - - while (heap_pop(&ca->heap, e, bucket_max_cmp)) { - sectors_total += bucket_sectors_used(e.g); - e.g->copygc_gen = gen_current; - if (gen_current < NUM_GC_GENS && - sectors_total >= sectors_gen * gen_current) - gen_current++; - } - mutex_unlock(&ca->set->bucket_lock); - - mutex_unlock(&ca->heap_lock); - - read_moving(ca, &ctxt); - - trace_bcache_moving_gc_end(ca, ctxt.sectors_moved, ctxt.keys_moved, - buckets_to_move); -} - -static int bch_moving_gc_thread(void *arg) -{ - struct cache *ca = arg; - struct cache_set *c = ca->set; - struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last; - s64 next; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->copy_gc_enabled)) - break; - - last = atomic_long_read(&clock->now); - /* - * don't start copygc until less than half the gc reserve is - * available: - */ - next = (buckets_available_cache(ca) - - div64_u64((ca->mi.nbuckets - ca->mi.first_bucket) * - c->opts.gc_reserve_percent, 200)) * - ca->mi.bucket_size; - - if (next <= 0) - bch_moving_gc(ca); - else - bch_kthread_io_clock_wait(clock, last + next); - } - - return 0; -} - -#define MOVING_GC_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE -#define MOVING_GC_NR 64 -#define MOVING_GC_READ_NR 32 -#define MOVING_GC_WRITE_NR 32 - -int bch_moving_init_cache(struct cache *ca) -{ - bool rotational = !blk_queue_nonrot(bdev_get_queue(ca->disk_sb.bdev)); - - bch_pd_controller_init(&ca->moving_gc_pd); - ca->moving_gc_pd.d_term = 0; - - return bch_queue_init(&ca->moving_gc_queue, - ca->set, - MOVING_GC_KEYS_MAX_SIZE, - MOVING_GC_NR, - MOVING_GC_READ_NR, - MOVING_GC_WRITE_NR, - rotational, - "bch_copygc_write"); -} - -int bch_moving_gc_thread_start(struct cache *ca) -{ - struct task_struct *t; - - /* The moving gc read thread must be stopped */ - BUG_ON(ca->moving_gc_read != NULL); - - bch_queue_start(&ca->moving_gc_queue); - - if (cache_set_init_fault("moving_gc_start")) - return -ENOMEM; - - t = kthread_create(bch_moving_gc_thread, ca, "bch_copygc_read"); - if (IS_ERR(t)) - return PTR_ERR(t); - - ca->moving_gc_read = t; - wake_up_process(ca->moving_gc_read); - - return 0; -} - -void bch_moving_gc_stop(struct cache *ca) -{ - ca->moving_gc_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&ca->moving_gc_pd.rate); - - bch_queue_stop(&ca->moving_gc_queue); - - if (ca->moving_gc_read) - kthread_stop(ca->moving_gc_read); - ca->moving_gc_read = NULL; - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */ - bch_scan_keylist_reset(&ca->moving_gc_queue.keys); -} - -void bch_moving_gc_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->moving_gc_queue); -} diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h deleted file mode 100644 index 5d09e0fa3ae1..000000000000 --- a/fs/bcachefs/movinggc.h +++ /dev/null @@ -1,9 +0,0 @@ -#ifndef _BCACHE_MOVINGGC_H -#define _BCACHE_MOVINGGC_H - -int bch_moving_init_cache(struct cache *); -void bch_moving_gc_stop(struct cache *); -int bch_moving_gc_thread_start(struct cache *); -void bch_moving_gc_destroy(struct cache *); - -#endif diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c new file mode 100644 index 000000000000..cedad5462da3 --- /dev/null +++ b/fs/bcachefs/rebalance.c @@ -0,0 +1,467 @@ +/* + * Copygc, tiering: + */ + +#include "bcache.h" +#include "btree_iter.h" +#include "buckets.h" +#include "clock.h" +#include "io.h" +#include "move.h" + +#include <trace/events/bcache.h> +#include <linux/freezer.h> +#include <linux/kthread.h> +#include <linux/bsearch.h> +#include <linux/sort.h> + +/* + * XXX preserve ordering when reads complete out of order + * + * do performance testing with disk write cache off + */ + +static inline bool rebalance_entry_sectors_cmp(struct rebalance_bucket_entry l, + struct rebalance_bucket_entry r) +{ + return l.sectors < r.sectors; +} + +static int rebalance_entry_bucket_cmp(const void *_l, const void *_r) +{ + const struct rebalance_bucket_entry *l = _l; + const struct rebalance_bucket_entry *r = _r; + + if (l->dev != r->dev) + return l->dev < r->dev ? -1 : 1; + if (l->bucket != r->bucket) + return l->bucket < r->bucket ? -1 : 1; + return 0; +} + +static inline void rebalance_heap_push(struct rebalance_thread *r, + size_t bucket, u8 dev, + u8 gen, unsigned sectors) +{ + struct rebalance_bucket_entry new = { + .bucket = bucket, + .dev = dev, + .gen = gen, + .sectors = sectors, + }; + + if (!heap_full(&r->heap)) + heap_add(&r->heap, new, rebalance_entry_sectors_cmp); + else if (rebalance_entry_sectors_cmp(new, heap_peek(&r->heap))) { + r->heap.data[0] = new; + heap_sift(&r->heap, 0, rebalance_entry_sectors_cmp); + } +} + +/* returns nr of extents that should be written to this tier: */ +static unsigned should_tier_extent(struct cache_set *c, + struct rebalance_thread *r, + struct cache_member_rcu *mi, + struct bkey_s_c_extent e) +{ + const struct bch_extent_ptr *ptr; + unsigned replicas = 0; + + /* Make sure we have room to add a new pointer: */ + if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > + BKEY_EXTENT_VAL_U64s_MAX) + return false; + + extent_for_each_ptr(e, ptr) + if (PTR_TIER(mi, ptr) >= r->tier) + replicas++; + + return replicas < c->opts.data_replicas + ? 
c->opts.data_replicas - replicas + : 0; +} + +static bool should_copygc_ptr(struct cache_set *c, + struct rebalance_thread *r, + struct cache_member_rcu *mi, + const struct bch_extent_ptr *ptr) +{ + struct cache *ca; + bool ret = false; + + if (PTR_TIER(mi, ptr) == r->tier && + (ca = PTR_CACHE(c, ptr))) { + struct rebalance_bucket_entry *e, s = { + .dev = ptr->dev, + .bucket = PTR_BUCKET_NR(ca, ptr), + }; + + mutex_lock(&r->heap_lock); + + e = bsearch(&s, + r->heap.data, + r->heap.used, + sizeof(r->heap.data[0]), + rebalance_entry_bucket_cmp); + if (e && + e->gen == ptr->gen && + e->gen == PTR_BUCKET_GEN(ca, ptr)) + ret = true; + + mutex_unlock(&r->heap_lock); + } + + return ret; +} + +static bool rebalance_pred(struct cache_set *c, + struct rebalance_thread *r, + struct bkey_s_c k) +{ + bool need_tier = false, need_copygc = false; + + if (bkey_extent_is_data(k.k)) { + struct bkey_s_c_extent e = bkey_s_c_to_extent(k); + const struct bch_extent_ptr *ptr; + struct cache_member_rcu *mi = cache_member_info_get(c); + + if (should_tier_extent(c, r, mi, e)) + need_tier = true; + + extent_for_each_ptr(e, ptr) + if (should_copygc_ptr(c, r, mi, ptr)) + need_copygc = true; + + cache_member_info_put(); + } + + return need_tier || need_copygc; +} + +static int rebalance_extent(struct cache_set *c, + struct rebalance_thread *r, + struct bkey_s_c k, + struct move_context *m) +{ + struct bkey_s_extent e; + struct bch_extent_ptr *ptr; + struct moving_io *io; + unsigned nr_new_extents; + bool have_faster_extent = false; + struct cache_member_rcu *mi; + + io = bch_moving_io_alloc(k); + if (!io) { + //trace_bcache_moving_gc_alloc_fail(c, k.k->size); + return -ENOMEM; + } + + bch_replace_init(&io->replace, k); + + /* How the piss are reserves going to work? */ + + bch_write_op_init(&io->op, c, &io->bio, + (struct disk_reservation) { 0 }, + &r->wp, k, + &io->replace.hook, NULL, + bkey_extent_is_cached(k.k) + ? 
BCH_WRITE_CACHED : 0); + + io->op.io_wq = r->wq; + + e = bkey_i_to_s_extent(&io->op.insert_key); + + mi = cache_member_info_get(c); + + nr_new_extents = should_tier_extent(c, r, mi, e.c); + + extent_for_each_ptr_backwards(e, ptr) { + if (PTR_TIER(mi, ptr) < r->tier) { + if (have_faster_extent) + bch_extent_drop_ptr(e, ptr); + else + have_faster_extent = true; + } + + if (should_copygc_ptr(c, r, mi, ptr)) { + bch_extent_drop_ptr(e, ptr); + nr_new_extents++; + } + } + + cache_member_info_put(); + + if (!nr_new_extents) { + /* We raced - bucket's been reused */ + bch_moving_io_free(io); + return 0; + } + io->op.nr_replicas = nr_new_extents; + + bch_data_move(m, io); + return 0; +} + +static void rebalance_walk_extents(struct cache_set *c, + struct rebalance_thread *r) +{ + struct btree_iter iter; + struct bkey_s_c k; + struct move_context m; + + move_context_init(&m); + bch_ratelimit_reset(&r->pd.rate); + + for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, POS_MIN, k) { + if (kthread_should_stop()) + break; + + if (rebalance_pred(c, r, k)) { + BKEY_PADDED(k) tmp; + + bkey_reassemble(&tmp.k, k); + bch_btree_iter_unlock(&iter); + + rebalance_extent(c, r, + bkey_i_to_s_c(&tmp.k), + &m); + } + + bch_btree_iter_cond_resched(&iter); + } + bch_btree_iter_unlock(&iter); + + closure_sync(&m.cl); +} + +static void bch_rebalance(struct cache_set *c, struct rebalance_thread *r) +{ + struct cache_group devs, *tier = &c->cache_tiers[r->tier]; + struct rebalance_bucket_entry e; + unsigned i, seq, sectors_used; + u64 sectors_to_move, reserve_sectors = 0; + size_t buckets_unused = 0; + + rcu_read_lock(); + + do { + seq = read_seqcount_begin(&tier->lock); + devs = *tier; + } while (read_seqcount_retry(&tier->lock, seq)); + + for (i = 0; i < devs.nr_devices; i++) + percpu_ref_get(&rcu_dereference(devs.devices[i])->ref); + + rcu_read_unlock(); + + mutex_lock(&r->heap_lock); + + r->heap.used = 0; + + for (i = 0; i < devs.nr_devices; i++) { + struct cache *ca = + rcu_dereference_protected(devs.devices[i], 1); + size_t bucket; + + spin_lock(&ca->freelist_lock); + reserve_sectors += ca->mi.bucket_size * + fifo_used(&ca->free[RESERVE_MOVINGGC]); + spin_unlock(&ca->freelist_lock); + + for (bucket = ca->mi.first_bucket; + bucket < ca->mi.nbuckets; + bucket++) { + struct bucket *g = ca->buckets + bucket; + + if (bucket_unused(g)) { + buckets_unused++; + continue; + } + + if (g->mark.owned_by_allocator || + g->mark.is_metadata) + continue; + + sectors_used = bucket_sectors_used(g); + + if (sectors_used >= ca->mi.bucket_size) + continue; + + rebalance_heap_push(r, bucket, ca->sb.nr_this_dev, + ca->bucket_gens[bucket], + sectors_used); + } + } + + /* + * Problems... + * XXX: wait on the allocator? perhaps the allocator just hasn't + * invalidated/discarded buckets we freed up from our last run? 
+ */
+ if (!reserve_sectors)
+ goto out_put;
+
+ sectors_to_move = 0;
+ for (i = 0; i < r->heap.used; i++)
+ sectors_to_move += r->heap.data[i].sectors;
+
+ /*
+ * If there's not enough work to do, bail out so we aren't scanning the
+ * btree unnecessarily:
+ *
+ * XXX: calculate this threshold rigorously
+ */
+#if 0
+ if (r->heap.used < ca->free_inc.size / 2 &&
+ sectors_to_move < reserve_sectors)
+ goto out_put;
+#endif
+
+ /* Pop buckets off until they fit into our reserve: */
+ while (sectors_to_move > reserve_sectors) {
+ BUG_ON(!heap_pop(&r->heap, e, rebalance_entry_sectors_cmp));
+ sectors_to_move -= e.sectors;
+ }
+
+ sort(r->heap.data,
+ r->heap.used,
+ sizeof(r->heap.data[0]),
+ rebalance_entry_bucket_cmp,
+ NULL);
+
+ mutex_unlock(&r->heap_lock);
+
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference_protected(devs.devices[i],
+ 1)->ref);
+
+ rebalance_walk_extents(c, r);
+ return;
+
+out_put:
+ mutex_unlock(&r->heap_lock);
+ for (i = 0; i < devs.nr_devices; i++)
+ percpu_ref_put(&rcu_dereference(devs.devices[i])->ref);
+}
+
+static int bch_rebalance_thread(void *arg)
+{
+ struct rebalance_thread *r = arg;
+ struct cache_set *c = container_of(r, struct cache_set,
+ rebalance[r->tier]);
+ struct io_clock *clock = &c->io_clock[WRITE];
+ unsigned long last;
+ //bool moved;
+
+ while (!kthread_should_stop()) {
+ if (kthread_wait_freezable(c->copy_gc_enabled ||
+ c->tiering_enabled))
+ break;
+
+ last = atomic_long_read(&clock->now);
+
+ bch_rebalance(c, r);
+
+ /*
+ * This really should be library code, but it has to be
+ * kthread specific... ugh
+ */
+#if 0
+ if (!moved)
+ bch_kthread_io_clock_wait(clock,
+ last + ca->free_inc.size / 2);
+#endif
+ }
+
+ return 0;
+}
+
+static void bch_rebalance_exit_tier(struct rebalance_thread *r)
+{
+ if (r->p)
+ kthread_stop(r->p);
+ r->p = NULL;
+ if (r->wq)
+ destroy_workqueue(r->wq);
+ r->wq = NULL;
+ free_heap(&r->heap);
+}
+
+void bch_rebalance_exit(struct cache_set *c)
+{
+ unsigned i;
+
+ for (i = 0; i < ARRAY_SIZE(c->rebalance); i++)
+ bch_rebalance_exit_tier(&c->rebalance[i]);
+}
+
+/*
+ * Called whenever we add a device - initializes the per tier rebalance thread,
+ * or resizes the heap if necessary
+ */
+int bch_rebalance_init(struct cache_set *c, struct cache *ca)
+{
+ unsigned tier = ca->mi.tier;
+ struct rebalance_thread *r = &c->rebalance[tier];
+ struct task_struct *p;
+ u64 nbuckets = 0;
+ size_t heap_size;
+ unsigned i;
+ typeof(r->heap) old_heap;
+
+ lockdep_assert_held(&bch_register_lock);
+
+ if (!r->initialized) {
+ r->tier = tier;
+ mutex_init(&r->heap_lock);
+ r->wp.group = &c->cache_tiers[tier];
+ r->wp.reserve = RESERVE_MOVINGGC; /* XXX */
+ r->initialized = 1;
+ }
+
+ if (!r->wq)
+ r->wq = create_workqueue("bch_rebalance_io");
+ if (!r->wq)
+ return -ENOMEM;
+
+ if (!r->p) {
+ p = kthread_create(bch_rebalance_thread, r,
+ "bch_rebalance");
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+
+ r->p = p;
+ }
+
+ /* ca hasn't been added to array of devices yet: */
+ nbuckets += ca->mi.nbuckets;
+
+ rcu_read_lock();
+ for_each_cache_rcu(ca, c, i)
+ if (ca->mi.tier == tier)
+ nbuckets += ca->mi.nbuckets;
+ rcu_read_unlock();
+
+ mutex_lock(&r->heap_lock);
+ old_heap = r->heap;
+
+ heap_size = max_t(size_t, nbuckets >> 7, old_heap.used);
+ BUG_ON(!heap_size);
+
+ if (!init_heap(&r->heap, heap_size, GFP_KERNEL)) {
+ mutex_unlock(&r->heap_lock);
+ return -ENOMEM;
+ }
+
+ if (old_heap.data) {
+ memcpy(r->heap.data,
+ old_heap.data,
+ sizeof(old_heap.data[0]) * old_heap.used);
+ r->heap.used = old_heap.used;
+ 
free_heap(&old_heap); + } + + mutex_unlock(&r->heap_lock); + + return 0; +} diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h new file mode 100644 index 000000000000..3a15dff7bdff --- /dev/null +++ b/fs/bcachefs/rebalance.h @@ -0,0 +1,7 @@ +#ifndef _BCACHE_REBALANCE_H +#define _BCACHE_REBALANCE_H + +void bch_rebalance_exit(struct cache_set *); +int bch_rebalance_init(struct cache_set *, struct cache *ca); + +#endif /* _BCACHE_REBALANCE_H */ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ef8fb0dac003..beb0587be4ce 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -24,11 +24,10 @@ #include "keylist.h" #include "move.h" #include "migrate.h" -#include "movinggc.h" #include "notify.h" +#include "rebalance.h" #include "stats.h" #include "super.h" -#include "tier.h" #include "writeback.h" #include <linux/backing-dev.h> @@ -683,15 +682,6 @@ static void __bch_cache_set_read_only(struct cache_set *c) struct cache *ca; unsigned i; - c->tiering_pd.rate.rate = UINT_MAX; - bch_ratelimit_reset(&c->tiering_pd.rate); - bch_tiering_read_stop(c); - - for_each_cache(ca, c, i) { - bch_tiering_write_stop(ca); - bch_moving_gc_stop(ca); - } - bch_gc_thread_stop(c); bch_btree_flush(c); @@ -804,7 +794,6 @@ void bch_cache_set_read_only_sync(struct cache_set *c) static const char *__bch_cache_set_read_write(struct cache_set *c) { - struct cache *ca; const char *err; unsigned i; @@ -822,22 +811,9 @@ static const char *__bch_cache_set_read_write(struct cache_set *c) if (bch_gc_thread_start(c)) goto err; - for_each_cache(ca, c, i) { - if (ca->mi.state != CACHE_ACTIVE) - continue; - - err = "error starting moving GC thread"; - if (bch_moving_gc_thread_start(ca)) { - percpu_ref_put(&ca->ref); - goto err; - } - - bch_tiering_write_start(ca); - } - - err = "error starting tiering thread"; - if (bch_tiering_read_start(c)) - goto err; + for (i = 0; i < ARRAY_SIZE(c->rebalance); i++) + if (c->rebalance[i].p) + wake_up_process(c->rebalance[i].p); schedule_delayed_work(&c->pd_controllers_update, 5 * HZ); @@ -877,6 +853,7 @@ static void cache_set_free(struct cache_set *c) cancel_work_sync(&c->bio_submit_work); cancel_work_sync(&c->read_retry_work); + bch_rebalance_exit(c); bch_bset_sort_state_free(&c->sort); bch_btree_cache_free(c); bch_journal_free(&c->journal); @@ -1061,11 +1038,8 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, mutex_init(&c->btree_root_lock); INIT_WORK(&c->read_only_work, bch_cache_set_read_only_work); mutex_init(&c->mi_lock); - init_rwsem(&c->gc_lock); mutex_init(&c->trigger_gc_lock); - mutex_init(&c->gc_scan_keylist_lock); - INIT_LIST_HEAD(&c->gc_scan_keylists); #define BCH_TIME_STAT(name, frequency_units, duration_units) \ spin_lock_init(&c->name##_time.lock); @@ -1073,7 +1047,6 @@ static struct cache_set *bch_cache_set_alloc(struct cache_sb *sb, #undef BCH_TIME_STAT bch_open_buckets_init(c); - bch_tiering_init_cache_set(c); INIT_LIST_HEAD(&c->list); INIT_LIST_HEAD(&c->cached_devs); @@ -1507,8 +1480,7 @@ static void __bch_cache_read_only(struct cache *ca) { trace_bcache_cache_read_only(ca); - bch_tiering_write_stop(ca); - bch_moving_gc_stop(ca); + /* XXX do stuff with rebalance thread */ /* * This stops new data writes (e.g. to existing open data @@ -1564,19 +1536,12 @@ static const char *__bch_cache_read_write(struct cache *ca) trace_bcache_cache_read_write(ca); - bch_tiering_write_start(ca); - - trace_bcache_cache_read_write_done(ca); - - /* XXX wtf? 
*/ - return NULL; - - err = "error starting moving GC thread"; - if (!bch_moving_gc_thread_start(ca)) - err = NULL; + if (bch_cache_allocator_start(ca)) + return "error starting allocator thread"; - wake_up_process(ca->set->tiering_read); + /* XXX notify rebalance thread? */ + trace_bcache_cache_read_write_done(ca); bch_notify_cache_read_write(ca); return err; @@ -1633,8 +1598,6 @@ static void bch_cache_free_work(struct work_struct *work) * to unregister them before we drop our reference to * @c. */ - bch_moving_gc_destroy(ca); - bch_tiering_write_destroy(ca); cancel_work_sync(&ca->io_error_work); @@ -1890,9 +1853,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, kobject_init(&ca->kobj, &bch_cache_ktype); - seqcount_init(&ca->self.lock); - ca->self.nr_devices = 1; - rcu_assign_pointer(ca->self.devices[0], ca); ca->sb.nr_this_dev = sb->sb->nr_this_dev; INIT_WORK(&ca->free_work, bch_cache_free_work); @@ -1919,8 +1879,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, ca->bucket_bits = ilog2(ca->mi.bucket_size); /* XXX: tune these */ - movinggc_reserve = max_t(size_t, NUM_GC_GENS * 2, - ca->mi.nbuckets >> 7); + movinggc_reserve = ca->mi.nbuckets >> 7; reserve_none = max_t(size_t, 4, ca->mi.nbuckets >> 9); free_inc_reserve = reserve_none << 1; heap_size = max_t(size_t, free_inc_reserve, movinggc_reserve); @@ -1946,8 +1905,7 @@ static const char *cache_alloc(struct bcache_superblock *sb, bioset_init(&ca->replica_set, 4, offsetof(struct bch_write_bio, bio.bio)) || !(ca->sectors_written = alloc_percpu(*ca->sectors_written)) || - bch_moving_init_cache(ca) || - bch_tiering_init_cache(ca)) + bch_rebalance_init(c, ca)) goto err; ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca); @@ -1957,20 +1915,6 @@ static const char *cache_alloc(struct bcache_superblock *sb, total_reserve += ca->free[i].size; pr_debug("%zu buckets reserved", total_reserve); - for (i = 0; i < ARRAY_SIZE(ca->gc_buckets); i++) { - ca->gc_buckets[i].reserve = RESERVE_MOVINGGC; - ca->gc_buckets[i].group = &ca->self; - } - - ca->tiering_write_point.reserve = RESERVE_NONE; - ca->tiering_write_point.group = &ca->self; - - /* XXX: scan keylists will die */ - bch_scan_keylist_init(&ca->moving_gc_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - bch_scan_keylist_init(&ca->tiering_queue.keys, c, - DFLT_SCAN_KEYLIST_MAX_SIZE); - kobject_get(&c->kobj); ca->set = c; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a901b5d8368a..446552c460ec 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -138,14 +138,14 @@ rw_attribute(cache_replacement_policy); rw_attribute(foreground_write_ratelimit_enabled); rw_attribute(copy_gc_enabled); -sysfs_queue_attribute(copy_gc); -sysfs_pd_controller_attribute(copy_gc); +//sysfs_queue_attribute(copy_gc); +//sysfs_pd_controller_attribute(copy_gc); rw_attribute(tiering_enabled); rw_attribute(tiering_percent); sysfs_pd_controller_attribute(tiering); -sysfs_queue_attribute(tiering); -rw_attribute(tiering_stripe_size); +//sysfs_queue_attribute(tiering); +//rw_attribute(tiering_stripe_size); sysfs_pd_controller_attribute(foreground_write); @@ -701,7 +701,7 @@ SHOW(bch_cache_set) sysfs_printf(tiering_enabled, "%i", c->tiering_enabled); sysfs_print(tiering_percent, c->tiering_percent); - sysfs_pd_controller_show(tiering, &c->tiering_pd); + //sysfs_pd_controller_show(tiering, &c->tiering_pd); sysfs_print(btree_flush_delay, c->btree_flush_delay); @@ -781,23 +781,26 @@ STORE(__bch_cache_set) c->foreground_write_ratelimit_enabled); if (attr == 
&sysfs_copy_gc_enabled) { - struct cache *ca; - unsigned i; ssize_t ret = strtoul_safe(buf, c->copy_gc_enabled) ?: (ssize_t) size; +#if 0 + struct cache *ca; + unsigned i; for_each_cache(ca, c, i) if (ca->moving_gc_read) wake_up_process(ca->moving_gc_read); +#endif return ret; } if (attr == &sysfs_tiering_enabled) { ssize_t ret = strtoul_safe(buf, c->tiering_enabled) ?: (ssize_t) size; - +#if 0 if (c->tiering_read) wake_up_process(c->tiering_read); +#endif return ret; } @@ -807,7 +810,6 @@ STORE(__bch_cache_set) if (attr == &sysfs_journal_flush) { bch_journal_meta_async(&c->journal, NULL); - return size; } @@ -816,7 +818,7 @@ STORE(__bch_cache_set) sysfs_strtoul(foreground_target_percent, c->foreground_target_percent); sysfs_strtoul(tiering_percent, c->tiering_percent); - sysfs_pd_controller_store(tiering, &c->tiering_pd); + //sysfs_pd_controller_store(tiering, &c->tiering_pd); /* Debugging: */ @@ -1210,13 +1212,13 @@ SHOW(bch_cache) sysfs_print(free_buckets, buckets_free_cache(ca, RESERVE_NONE)); sysfs_print(has_data, ca->mi.has_data); sysfs_print(has_metadata, ca->mi.has_metadata); - +#if 0 sysfs_pd_controller_show(copy_gc, &ca->moving_gc_pd); sysfs_queue_show(copy_gc, &ca->moving_gc_queue); sysfs_queue_show(tiering, &ca->tiering_queue); sysfs_print(tiering_stripe_size, ca->tiering_stripe_size); - +#endif if (attr == &sysfs_cache_replacement_policy) return bch_snprint_string_list(buf, PAGE_SIZE, cache_replacement_policies, @@ -1250,13 +1252,13 @@ STORE(__bch_cache) struct cache *ca = container_of(kobj, struct cache, kobj); struct cache_set *c = ca->set; struct cache_member *mi = &c->disk_mi[ca->sb.nr_this_dev]; - +#if 0 sysfs_pd_controller_store(copy_gc, &ca->moving_gc_pd); sysfs_queue_store(copy_gc, &ca->moving_gc_queue); sysfs_queue_store(tiering, &ca->tiering_queue); sysfs_strtoul(tiering_stripe_size, ca->tiering_stripe_size); - +#endif if (attr == &sysfs_discard) { bool v = strtoul_or_return(buf); @@ -1377,10 +1379,6 @@ static struct attribute *bch_cache_files[] = { &sysfs_state_rw, &sysfs_alloc_debug, - sysfs_pd_controller_files(copy_gc), - sysfs_queue_files(copy_gc), - sysfs_queue_files(tiering), - &sysfs_tiering_stripe_size, NULL }; KTYPE(bch_cache); diff --git a/fs/bcachefs/tier.c b/fs/bcachefs/tier.c deleted file mode 100644 index caf6b3df2c9c..000000000000 --- a/fs/bcachefs/tier.c +++ /dev/null @@ -1,466 +0,0 @@ - -#include "bcache.h" -#include "btree_iter.h" -#include "buckets.h" -#include "clock.h" -#include "extents.h" -#include "io.h" -#include "keylist.h" -#include "move.h" -#include "tier.h" - -#include <linux/delay.h> -#include <linux/freezer.h> -#include <linux/kthread.h> -#include <trace/events/bcache.h> - -/** - * tiering_pred - check if tiering should copy an extent to tier 1 - */ -static bool tiering_pred(struct scan_keylist *kl, struct bkey_s_c k) -{ - struct cache *ca = container_of(kl, struct cache, - tiering_queue.keys); - struct cache_set *c = ca->set; - - if (bkey_extent_is_data(k.k)) { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const struct bch_extent_ptr *ptr; - struct cache_member_rcu *mi; - unsigned replicas = 0; - - /* Make sure we have room to add a new pointer: */ - if (bkey_val_u64s(e.k) + BKEY_EXTENT_PTR_U64s_MAX > - BKEY_EXTENT_VAL_U64s_MAX) - return false; - - mi = cache_member_info_get(c); - extent_for_each_ptr(e, ptr) - if (ptr->dev < mi->nr_in_set && - mi->m[ptr->dev].tier) - replicas++; - cache_member_info_put(); - - return replicas < c->opts.data_replicas; - } - - return false; -} - -struct tiering_refill { - struct bpos start; - 
struct cache *ca; - int cache_iter; - u64 sectors; -}; - -static void refill_done(struct tiering_refill *refill) -{ - if (refill->ca) { - percpu_ref_put(&refill->ca->ref); - refill->ca = NULL; - } -} - -/** - * refill_next - move on to refilling the next cache's tiering keylist - */ -static void refill_next(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - refill_done(refill); - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) - goto out; - - while (1) { - while (refill->cache_iter < tier->nr_devices) { - refill->ca = rcu_dereference( - tier->devices[refill->cache_iter]); - if (refill->ca != NULL) { - percpu_ref_get(&refill->ca->ref); - goto out; - } - refill->cache_iter++; - } - - /* Reached the end, wrap around */ - refill->cache_iter = 0; - } - -out: - rcu_read_unlock(); -} - -/* - * refill_init - Start refilling a random cache device -- this ensures we - * distribute data sanely even if each tiering pass discovers only a few - * keys to tier - */ -static void refill_init(struct cache_set *c, struct tiering_refill *refill) -{ - struct cache_group *tier; - - memset(refill, 0, sizeof(*refill)); - refill->start = POS_MIN; - - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices != 0) - refill->cache_iter = bch_rand_range(tier->nr_devices); - rcu_read_unlock(); - - refill_next(c, refill); -} - -/** - * tiering_keylist_full - we accumulate tiering_stripe_size sectors in a cache - * device's tiering keylist before we move on to the next cache device - */ -static bool tiering_keylist_full(struct tiering_refill *refill) -{ - return (refill->sectors >= refill->ca->tiering_stripe_size); -} - -/** - * tiering_keylist_empty - to prevent a keylist from growing to more than twice - * the tiering stripe size, we stop refill when a keylist has more than a single - * stripe of sectors - */ -static bool tiering_keylist_empty(struct cache *ca) -{ - return (bch_scan_keylist_sectors(&ca->tiering_queue.keys) - <= ca->tiering_stripe_size); -} - -/** - * tiering_refill - to keep all queues busy as much as possible, we add - * up to a single stripe of sectors to each cache device's queue, iterating - * over all cache devices twice, so each one has two stripe's of writes - * queued up, before we have to wait for move operations to complete. - */ -static void tiering_refill(struct cache_set *c, struct tiering_refill *refill) -{ - struct scan_keylist *keys; - struct btree_iter iter; - struct bkey_s_c k; - - if (bkey_cmp(refill->start, POS_MAX) >= 0) - return; - - if (refill->ca == NULL) - return; - - if (!tiering_keylist_empty(refill->ca)) - return; - - trace_bcache_tiering_refill_start(c); - - for_each_btree_key(&iter, c, BTREE_ID_EXTENTS, refill->start, k) { - keys = &refill->ca->tiering_queue.keys; - - if (!tiering_pred(keys, k)) { - refill->start = k.k->p; - goto next; - } - - /* Growing the keylist might fail */ - if (bch_scan_keylist_add(keys, k)) - goto done; - - /* TODO: split key if refill->sectors is now > stripe_size */ - refill->sectors += k.k->size; - refill->start = k.k->p; - - /* Check if we've added enough keys to this keylist */ - if (tiering_keylist_full(refill)) { - /* Move on to refill the next cache device's keylist */ - refill->sectors = 0; - refill->cache_iter++; - refill_next(c, refill); - - /* All cache devices got removed somehow */ - if (refill->ca == NULL) - goto done; - - /* - * If the next cache's keylist is not sufficiently - * empty, wait for it to drain before refilling - * anything. 
We prioritize even distribution of data - * over maximizing write bandwidth. - */ - if (!tiering_keylist_empty(refill->ca)) - goto done; - } -next: - bch_btree_iter_cond_resched(&iter); - } - /* Reached the end of the keyspace */ - refill->start = POS_MAX; -done: - bch_btree_iter_unlock(&iter); - - trace_bcache_tiering_refill_end(c); -} - -static int issue_tiering_move(struct moving_queue *q, - struct moving_context *ctxt, - struct bkey_s_c k) -{ - struct cache *ca = container_of(q, struct cache, tiering_queue); - struct cache_set *c = ca->set; - struct moving_io *io; - - io = moving_io_alloc(k); - if (!io) { - trace_bcache_tiering_alloc_fail(c, k.k->size); - return -ENOMEM; - } - - bch_replace_init(&io->replace, bkey_i_to_s_c(&io->key)); - - bch_write_op_init(&io->op, c, &io->wbio, - (struct disk_reservation) { 0 }, - &ca->tiering_write_point, - bkey_i_to_s_c(&io->key), - &io->replace.hook, NULL, 0); - io->op.io_wq = q->wq; - io->op.nr_replicas = 1; - - trace_bcache_tiering_copy(k.k); - - /* - * IMPORTANT: We must call bch_data_move before we dequeue so - * that the key can always be found in either the pending list - * in the moving queue or in the scan keylist list in the - * moving queue. - * If we reorder, there is a window where a key is not found - * by btree gc marking. - */ - bch_data_move(q, ctxt, io); - bch_scan_keylist_dequeue(&q->keys); - return 0; -} - -/** - * tiering_next_cache - issue a move to write an extent to the next cache - * device in round robin order - */ -static int tiering_next_cache(struct cache_set *c, - int *cache_iter, - struct moving_context *ctxt, - struct tiering_refill *refill) -{ - struct cache_group *tier; - int start = *cache_iter; - struct cache *ca; - - /* If true at the end of the loop, all keylists were empty, so we - * have reached the end of the keyspace */ - bool done = true; - /* If true at the end of the loop, all queues were full, so we must - * wait for some ops to finish */ - bool full = true; - - do { - rcu_read_lock(); - tier = &c->cache_tiers[1]; - if (tier->nr_devices == 0) { - rcu_read_unlock(); - return 0; - } - - if (*cache_iter >= tier->nr_devices) { - rcu_read_unlock(); - *cache_iter = 0; - continue; - } - - ca = rcu_dereference(tier->devices[*cache_iter]); - if (ca == NULL || - ca->mi.state != CACHE_ACTIVE || - ca->tiering_queue.stopped) { - rcu_read_unlock(); - (*cache_iter)++; - continue; - } - - percpu_ref_get(&ca->ref); - rcu_read_unlock(); - (*cache_iter)++; - - tiering_refill(c, refill); - - if (bch_queue_full(&ca->tiering_queue)) { - done = false; - } else { - struct bkey_i *k = - bch_scan_keylist_next(&ca->tiering_queue.keys); - if (k) { - issue_tiering_move(&ca->tiering_queue, ctxt, - bkey_i_to_s_c(k)); - done = false; - full = false; - } - } - - percpu_ref_put(&ca->ref); - } while (*cache_iter != start); - - if (done) { - /* - * All devices have an empty keylist now, just wait for - * pending moves to finish and we're done. - */ - return 0; - } else if (full) { - /* - * No device with keys still remaining on its keylist has a - * queue that is not full. In this case, we have to wait for - * at least one read to complete before trying again. - * Otherwise, we could issue a read for this device. 
- */ - return -EAGAIN; - } else { - /* Try again immediately */ - return -EIOCBQUEUED; - } -} - -static u64 read_tiering(struct cache_set *c) -{ - struct moving_context ctxt; - struct tiering_refill refill; - int cache_iter = 0; - int ret; - - trace_bcache_tiering_start(c); - - refill_init(c, &refill); - - bch_moving_context_init(&ctxt, &c->tiering_pd.rate, - MOVING_PURPOSE_TIERING); - - while (!bch_moving_context_wait(&ctxt)) { - cond_resched(); - - ret = tiering_next_cache(c, &cache_iter, &ctxt, &refill); - if (ret == -EAGAIN) - bch_moving_wait(&ctxt); - else if (!ret) - break; - } - - closure_sync(&ctxt.cl); - refill_done(&refill); - - trace_bcache_tiering_end(c, ctxt.sectors_moved, ctxt.keys_moved); - - return ctxt.sectors_moved; -} - -static int bch_tiering_thread(void *arg) -{ - struct cache_set *c = arg; - struct io_clock *clock = &c->io_clock[WRITE]; - struct cache *ca; - u64 sectors, tier_capacity; - unsigned long last; - unsigned i; - - set_freezable(); - - while (!kthread_should_stop()) { - if (kthread_wait_freezable(c->tiering_enabled && - c->cache_tiers[1].nr_devices)) - break; - - last = atomic_long_read(&clock->now); - - sectors = read_tiering(c); - - tier_capacity = 0; - rcu_read_lock(); - group_for_each_cache_rcu(ca, &c->cache_tiers[0], i) - tier_capacity += - (ca->mi.nbuckets - - ca->mi.first_bucket) << ca->bucket_bits; - rcu_read_unlock(); - - if (sectors < tier_capacity >> 4) - bch_kthread_io_clock_wait(clock, - last + (tier_capacity >> 5)); - } - - return 0; -} - -#define TIERING_KEYS_MAX_SIZE DFLT_SCAN_KEYLIST_MAX_SIZE -#define TIERING_NR 64 -#define TIERING_READ_NR 8 -#define TIERING_WRITE_NR 32 - -void bch_tiering_init_cache_set(struct cache_set *c) -{ - bch_pd_controller_init(&c->tiering_pd); -} - -int bch_tiering_init_cache(struct cache *ca) -{ - ca->tiering_stripe_size = ca->mi.bucket_size * 2; - - return bch_queue_init(&ca->tiering_queue, - ca->set, - TIERING_KEYS_MAX_SIZE, - TIERING_NR, - TIERING_READ_NR, - TIERING_WRITE_NR, - false, - "bch_tier_write"); -} - -void bch_tiering_write_start(struct cache *ca) -{ - bch_queue_start(&ca->tiering_queue); -} - -int bch_tiering_read_start(struct cache_set *c) -{ - struct task_struct *t; - - t = kthread_create(bch_tiering_thread, c, "bch_tier_read"); - if (IS_ERR(t)) - return PTR_ERR(t); - - c->tiering_read = t; - wake_up_process(c->tiering_read); - - return 0; -} - -void bch_tiering_write_destroy(struct cache *ca) -{ - bch_queue_destroy(&ca->tiering_queue); -} - -void bch_tiering_write_stop(struct cache *ca) -{ - bch_queue_stop(&ca->tiering_queue); - - /* - * Make sure that it is empty so that gc marking doesn't keep - * marking stale entries from when last used. 
- */
- bch_scan_keylist_reset(&ca->tiering_queue.keys);
-}
-
-void bch_tiering_read_stop(struct cache_set *c)
-{
- if (!IS_ERR_OR_NULL(c->tiering_read)) {
- kthread_stop(c->tiering_read);
- c->tiering_read = NULL;
- }
-}
diff --git a/fs/bcachefs/tier.h b/fs/bcachefs/tier.h
deleted file mode 100644
index 57b4acf86fb5..000000000000
--- a/fs/bcachefs/tier.h
+++ /dev/null
@@ -1,12 +0,0 @@
-#ifndef _BCACHE_TIER_H
-#define _BCACHE_TIER_H
-
-void bch_tiering_init_cache_set(struct cache_set *);
-int bch_tiering_init_cache(struct cache *);
-int bch_tiering_read_start(struct cache_set *);
-void bch_tiering_write_start(struct cache *);
-void bch_tiering_write_destroy(struct cache *);
-void bch_tiering_write_stop(struct cache *);
-void bch_tiering_read_stop(struct cache_set *);
-
-#endif
diff --git a/include/trace/events/bcache.h b/include/trace/events/bcache.h
index 00bd60de2ce4..78eac03455a8 100644
--- a/include/trace/events/bcache.h
+++ b/include/trace/events/bcache.h
@@ -1060,6 +1060,7 @@ TRACE_EVENT(bcache_keyscan,
 __entry->end_inode, __entry->end_offset)
 );
+#if 0
 /* Moving IO */
 DECLARE_EVENT_CLASS(moving_io,
@@ -1215,6 +1216,7 @@ DEFINE_EVENT(bkey, bcache_tiering_copy,
 TP_PROTO(const struct bkey *k),
 TP_ARGS(k)
 );
+#endif
 /* Background writeback */
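
The sketch below is not part of the commit; it is a minimal, stand-alone user-space illustration of the copygc bucket-selection heuristic that the new bch_rebalance() implements: collect the candidate buckets with the least live data, then drop the fullest survivors until the total fits in the RESERVE_MOVINGGC reserve. The names bucket_entry and pick_buckets and the sample numbers are invented for the example, and a plain qsort() stands in for the kernel's bounded heap; the kernel code additionally skips unused, full, metadata and allocator-owned buckets before they are considered.

/*
 * Illustrative sketch only, assuming invented names; models the bucket
 * selection done under r->heap_lock in bch_rebalance().
 */
#include <stdio.h>
#include <stdlib.h>

struct bucket_entry {
	size_t   bucket;	/* bucket index on the device */
	unsigned sectors;	/* live sectors still in the bucket */
};

/* ascending by live sectors: emptiest (cheapest to evacuate) first */
static int cmp_sectors(const void *l, const void *r)
{
	const struct bucket_entry *a = l, *b = r;

	return (a->sectors > b->sectors) - (a->sectors < b->sectors);
}

/*
 * Keep at most @heap_size of the emptiest candidates, then trim from the
 * fullest end until their live data fits in @reserve_sectors.  Returns the
 * number of buckets selected; the selection is left at the front of @b.
 */
static size_t pick_buckets(struct bucket_entry *b, size_t nr,
			   size_t heap_size, unsigned long reserve_sectors)
{
	unsigned long sectors_to_move = 0;
	size_t i, nr_picked;

	/* a full sort stands in for the kernel's bounded heap */
	qsort(b, nr, sizeof(*b), cmp_sectors);

	nr_picked = nr < heap_size ? nr : heap_size;
	for (i = 0; i < nr_picked; i++)
		sectors_to_move += b[i].sectors;

	/* drop the fullest survivors until the work fits in the reserve */
	while (nr_picked && sectors_to_move > reserve_sectors)
		sectors_to_move -= b[--nr_picked].sectors;

	return nr_picked;
}

int main(void)
{
	struct bucket_entry buckets[] = {
		{ .bucket = 10, .sectors =  12 },
		{ .bucket = 11, .sectors = 120 },
		{ .bucket = 12, .sectors =  48 },
		{ .bucket = 13, .sectors =   4 },
		{ .bucket = 14, .sectors =  96 },
	};
	size_t nr = sizeof(buckets) / sizeof(buckets[0]);
	size_t n = pick_buckets(buckets, nr, 4, 80);
	size_t i;

	for (i = 0; i < n; i++)
		printf("evacuate bucket %zu (%u live sectors)\n",
		       buckets[i].bucket, buckets[i].sectors);
	return 0;
}

With the sample numbers above and an 80-sector reserve, pick_buckets() selects buckets 13, 10 and 12 (4 + 12 + 48 = 64 live sectors) and leaves the nearly-full buckets alone, which is the same preference for cheap-to-evacuate buckets the kernel heap gives before bch_rebalance() hands the surviving buckets to rebalance_walk_extents().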