Diffstat (limited to 'libbcachefs/btree_update.h')
-rw-r--r-- | libbcachefs/btree_update.h | 421 |
1 file changed, 421 insertions, 0 deletions
diff --git a/libbcachefs/btree_update.h b/libbcachefs/btree_update.h
new file mode 100644
index 00000000..b18c44c7
--- /dev/null
+++ b/libbcachefs/btree_update.h
@@ -0,0 +1,421 @@
+#ifndef _BCACHE_BTREE_INSERT_H
+#define _BCACHE_BTREE_INSERT_H
+
+#include "btree_cache.h"
+#include "btree_iter.h"
+#include "buckets.h"
+#include "journal.h"
+#include "vstructs.h"
+
+struct bch_fs;
+struct bkey_format_state;
+struct bkey_format;
+struct btree;
+
+static inline void btree_node_reset_sib_u64s(struct btree *b)
+{
+        b->sib_u64s[0] = b->nr.live_u64s;
+        b->sib_u64s[1] = b->nr.live_u64s;
+}
+
+struct btree_reserve {
+        struct disk_reservation disk_res;
+        unsigned                nr;
+        struct btree            *b[BTREE_RESERVE_MAX];
+};
+
+void __bch2_btree_calc_format(struct bkey_format_state *, struct btree *);
+bool bch2_btree_node_format_fits(struct bch_fs *c, struct btree *,
+                                 struct bkey_format *);
+
+/* Btree node freeing/allocation: */
+
+/*
+ * Tracks a btree node that has been (or is about to be) freed in memory, but
+ * has _not_ yet been freed on disk (because the write that makes the new
+ * node(s) visible and frees the old hasn't completed yet)
+ */
+struct pending_btree_node_free {
+        bool                    index_update_done;
+
+        __le64                  seq;
+        enum btree_id           btree_id;
+        unsigned                level;
+        __BKEY_PADDED(key, BKEY_BTREE_PTR_VAL_U64s_MAX);
+};
+
+/*
+ * Tracks an in-progress split/rewrite of a btree node and the update to the
+ * parent node:
+ *
+ * When we split/rewrite a node, we do all the updates in memory without
+ * waiting for any writes to complete - we allocate the new node(s) and update
+ * the parent node, possibly recursively up to the root.
+ *
+ * The end result is that we have one or more new nodes being written -
+ * possibly several, if there were multiple splits - and then a write (updating
+ * an interior node) which will make all these new nodes visible.
+ *
+ * Additionally, as we split/rewrite nodes we free the old nodes - but the old
+ * nodes can't be freed (their space on disk can't be reclaimed) until the
+ * update to the interior node that makes the new nodes visible completes -
+ * until then, the old nodes are still reachable on disk.
+ */
+struct btree_interior_update {
+        struct closure          cl;
+        struct bch_fs           *c;
+
+        struct list_head        list;
+
+        /* What kind of update are we doing? */
+        enum {
+                BTREE_INTERIOR_NO_UPDATE,
+                BTREE_INTERIOR_UPDATING_NODE,
+                BTREE_INTERIOR_UPDATING_ROOT,
+                BTREE_INTERIOR_UPDATING_AS,
+        } mode;
+
+        /*
+         * BTREE_INTERIOR_UPDATING_NODE:
+         * The update that made the new nodes visible was a regular update to
+         * an existing interior node - @b. We can't write out the update to @b
+         * until the new nodes we created are finished writing, so we block @b
+         * from writing by putting this btree_interior_update on the
+         * @b->write_blocked list with @write_blocked_list:
+         */
+        struct btree            *b;
+        struct list_head        write_blocked_list;
+
+        /*
+         * BTREE_INTERIOR_UPDATING_AS: the btree node we updated was freed, so
+         * now we're blocking another btree_interior_update:
+         * @parent_as - btree_interior_update that's waiting on our nodes to
+         * finish writing, before it can make new nodes visible on disk
+         * @wait - list of child btree_interior_updates that are waiting on
+         * this btree_interior_update to make all the new nodes visible before
+         * they can free their old btree nodes
+         */
+        struct btree_interior_update *parent_as;
+        struct closure_waitlist wait;
+
+        /*
+         * We may be freeing nodes that were dirty, and thus had journal
+         * entries pinned: we need to transfer the oldest of those pins to the
+         * btree_interior_update operation, and release it when the new node(s)
+         * are all persistent and reachable:
+         */
+        struct journal_entry_pin journal;
+
+        u64                     journal_seq;
+
+        /*
+         * Nodes being freed:
+         * Protected by c->btree_node_pending_free_lock
+         */
+        struct pending_btree_node_free pending[BTREE_MAX_DEPTH + GC_MERGE_NODES];
+        unsigned                nr_pending;
+
+        /* Only here to reduce stack usage on recursive splits: */
+        struct keylist          parent_keys;
+        /*
+         * Enough room for btree_split's keys without realloc - btree node
+         * pointers never have crc/compression info, so we only need to
+         * account for the pointers for three keys
+         */
+        u64                     inline_keys[BKEY_BTREE_PTR_U64s_MAX * 3];
+};
+
+#define for_each_pending_btree_node_free(c, as, p)                      \
+        list_for_each_entry(as, &c->btree_interior_update_list, list)   \
+                for (p = as->pending; p < as->pending + as->nr_pending; p++)
+
+void bch2_btree_node_free_inmem(struct btree_iter *, struct btree *);
+void bch2_btree_node_free_never_inserted(struct bch_fs *, struct btree *);
+void bch2_btree_open_bucket_put(struct bch_fs *c, struct btree *);
+
+struct btree *__bch2_btree_node_alloc_replacement(struct bch_fs *,
+                                                  struct btree *,
+                                                  struct bkey_format,
+                                                  struct btree_reserve *);
+
+struct btree_interior_update *
+bch2_btree_interior_update_alloc(struct bch_fs *);
+
+void bch2_btree_interior_update_will_free_node(struct bch_fs *,
+                                               struct btree_interior_update *,
+                                               struct btree *);
+
+void bch2_btree_set_root_initial(struct bch_fs *, struct btree *,
+                                 struct btree_reserve *);
+
+void bch2_btree_reserve_put(struct bch_fs *, struct btree_reserve *);
+struct btree_reserve *bch2_btree_reserve_get(struct bch_fs *,
+                                             struct btree *, unsigned,
+                                             unsigned, struct closure *);
+
+int bch2_btree_root_alloc(struct bch_fs *, enum btree_id, struct closure *);
+
+/* Inserting into a given leaf node (last stage of insert): */
+
+bool bch2_btree_bset_insert_key(struct btree_iter *, struct btree *,
+                                struct btree_node_iter *, struct bkey_i *);
+void bch2_btree_journal_key(struct btree_insert *trans, struct btree_iter *,
+                            struct bkey_i *);
+
+static inline void *btree_data_end(struct bch_fs *c, struct btree *b)
+{
+        return (void *) b->data + btree_bytes(c);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_start(struct bch_fs *c,
+                                                            struct btree *b)
+{
+        return (void *) ((u64 *) btree_data_end(c, b) - b->whiteout_u64s);
+}
+
+static inline struct bkey_packed *unwritten_whiteouts_end(struct bch_fs *c,
+                                                          struct btree *b)
+{
+        return btree_data_end(c, b);
+}
+
+static inline void *write_block(struct btree *b)
+{
+        return (void *) b->data + (b->written << 9);
+}
+
+static inline bool bset_written(struct btree *b, struct bset *i)
+{
+        return (void *) i < write_block(b);
+}
+
+static inline bool bset_unwritten(struct btree *b, struct bset *i)
+{
+        return (void *) i > write_block(b);
+}
+
+static inline unsigned bset_end_sector(struct bch_fs *c, struct btree *b,
+                                       struct bset *i)
+{
+        return round_up(bset_byte_offset(b, vstruct_end(i)),
+                        block_bytes(c)) >> 9;
+}
+
+static inline size_t bch_btree_keys_u64s_remaining(struct bch_fs *c,
+                                                   struct btree *b)
+{
+        struct bset *i = btree_bset_last(b);
+        unsigned used = bset_byte_offset(b, vstruct_end(i)) / sizeof(u64) +
+                b->whiteout_u64s +
+                b->uncompacted_whiteout_u64s;
+        unsigned total = c->sb.btree_node_size << 6;
+
+        EBUG_ON(used > total);
+
+        if (bset_written(b, i))
+                return 0;
+
+        return total - used;
+}
+
+static inline unsigned btree_write_set_buffer(struct btree *b)
+{
+        /*
+         * Could buffer up larger amounts of keys for btrees with larger keys,
+         * pending benchmarking:
+         */
+        return 4 << 10;
+}
+
+static inline struct btree_node_entry *want_new_bset(struct bch_fs *c,
+                                                     struct btree *b)
+{
+        struct bset *i = btree_bset_last(b);
+        unsigned offset = max_t(unsigned, b->written << 9,
+                                bset_byte_offset(b, vstruct_end(i)));
+        ssize_t n = (ssize_t) btree_bytes(c) - (ssize_t)
+                (offset + sizeof(struct btree_node_entry) +
+                 b->whiteout_u64s * sizeof(u64) +
+                 b->uncompacted_whiteout_u64s * sizeof(u64));
+
+        EBUG_ON(offset > btree_bytes(c));
+
+        if ((unlikely(bset_written(b, i)) && n > 0) ||
+            (unlikely(vstruct_bytes(i) > btree_write_set_buffer(b)) &&
+             n > btree_write_set_buffer(b)))
+                return (void *) b->data + offset;
+
+        return NULL;
+}
+
+/*
+ * write lock must be held on @b (else the dirty bset that we were going to
+ * insert into could be written out from under us)
+ */
+static inline bool bch2_btree_node_insert_fits(struct bch_fs *c,
+                                               struct btree *b, unsigned u64s)
+{
+        if (btree_node_is_extents(b)) {
+                /*
+                 * The insert key might split an existing key
+                 * (bch2_insert_fixup_extent() ->
+                 * BCH_EXTENT_OVERLAP_MIDDLE case):
+                 */
+                u64s += BKEY_EXTENT_U64s_MAX;
+        }
+
+        return u64s <= bch_btree_keys_u64s_remaining(c, b);
+}
+
+static inline void unreserve_whiteout(struct btree *b, struct bset_tree *t,
+                                      struct bkey_packed *k)
+{
+        if (bset_written(b, bset(b, t))) {
+                EBUG_ON(b->uncompacted_whiteout_u64s <
+                        bkeyp_key_u64s(&b->format, k));
+                b->uncompacted_whiteout_u64s -=
+                        bkeyp_key_u64s(&b->format, k);
+        }
+}
+
+static inline void reserve_whiteout(struct btree *b, struct bset_tree *t,
+                                    struct bkey_packed *k)
+{
+        if (bset_written(b, bset(b, t))) {
+                BUG_ON(!k->needs_whiteout);
+                b->uncompacted_whiteout_u64s +=
+                        bkeyp_key_u64s(&b->format, k);
+        }
+}
+
+void bch2_btree_insert_node(struct btree *, struct btree_iter *,
+                            struct keylist *, struct btree_reserve *,
+                            struct btree_interior_update *as);
+
+/* Normal update interface: */
+
+struct btree_insert {
+        struct bch_fs           *c;
+        struct disk_reservation *disk_res;
+        struct journal_res      journal_res;
+        u64                     *journal_seq;
+        struct extent_insert_hook *hook;
+        unsigned                flags;
+        bool                    did_work;
+
+        unsigned short          nr;
+        struct btree_insert_entry {
+                struct btree_iter *iter;
+                struct bkey_i   *k;
+                unsigned        extra_res;
+                /*
+                 * true if entire key was inserted - can only be false for
+                 * extents
+                 */
+                bool            done;
+        } *entries;
+};
+
+int __bch2_btree_insert_at(struct btree_insert *);
+
+#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
+#define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define BTREE_INSERT_ENTRY(_iter, _k)                                   \
+        ((struct btree_insert_entry) {                                  \
+                .iter   = (_iter),                                      \
+                .k      = (_k),                                         \
+                .done   = false,                                        \
+        })
+
+#define BTREE_INSERT_ENTRY_EXTRA_RES(_iter, _k, _extra)                 \
+        ((struct btree_insert_entry) {                                  \
+                .iter           = (_iter),                              \
+                .k              = (_k),                                 \
+                .extra_res      = (_extra),                             \
+                .done           = false,                                \
+        })
+
+/**
+ * bch2_btree_insert_at - insert one or more keys at iterator positions
+ * @_c:           filesystem
+ * @_disk_res:    disk reservation
+ * @_hook:        extent insert callback
+ * @_journal_seq: where to return the journal sequence number, if non-NULL
+ * @_flags:       BTREE_INSERT_* flags
+ *
+ * The remaining arguments are BTREE_INSERT_ENTRY()s - (iterator, key) pairs
+ * to insert.
+ *
+ * Return values:
+ * -EINTR: locking changed, this function should be called again. Only returned
+ *  if passed BTREE_INSERT_ATOMIC.
+ * -EROFS: filesystem read only
+ * -EIO: journal or btree node IO error
+ */
+#define bch2_btree_insert_at(_c, _disk_res, _hook,                      \
+                             _journal_seq, _flags, ...)                 \
+        __bch2_btree_insert_at(&(struct btree_insert) {                 \
+                .c              = (_c),                                 \
+                .disk_res       = (_disk_res),                          \
+                .journal_seq    = (_journal_seq),                       \
+                .hook           = (_hook),                              \
+                .flags          = (_flags),                             \
+                .nr             = COUNT_ARGS(__VA_ARGS__),              \
+                .entries        = (struct btree_insert_entry[]) {       \
+                        __VA_ARGS__                                     \
+                }})
+
+/*
+ * Don't drop/retake locks: instead return -EINTR if we need to upgrade to
+ * intent locks, -EAGAIN if we need to wait on the btree reserve
+ */
+#define BTREE_INSERT_ATOMIC             (1 << 0)
+
+/* Don't check for -ENOSPC: */
+#define BTREE_INSERT_NOFAIL             (1 << 1)
+
+/* For copygc, or when merging btree nodes */
+#define BTREE_INSERT_USE_RESERVE        (1 << 2)
+
+/*
+ * Insert is for journal replay: don't get journal reservations, or mark
+ * extents (bch_mark_key)
+ */
+#define BTREE_INSERT_JOURNAL_REPLAY     (1 << 3)
+
+int bch2_btree_insert_list_at(struct btree_iter *, struct keylist *,
+                              struct disk_reservation *,
+                              struct extent_insert_hook *, u64 *, unsigned);
+
+static inline bool journal_res_insert_fits(struct btree_insert *trans,
+                                           struct btree_insert_entry *insert)
+{
+        unsigned u64s = 0;
+        struct btree_insert_entry *i;
+
+        /*
+         * If we didn't get a journal reservation, we're in journal replay and
+         * we're not journalling updates:
+         */
+        if (!trans->journal_res.ref)
+                return true;
+
+        for (i = insert; i < trans->entries + trans->nr; i++)
+                u64s += jset_u64s(i->k->k.u64s + i->extra_res);
+
+        return u64s <= trans->journal_res.u64s;
+}
+
+int bch2_btree_insert_check_key(struct btree_iter *, struct bkey_i *);
+int bch2_btree_insert(struct bch_fs *, enum btree_id, struct bkey_i *,
+                      struct disk_reservation *,
+                      struct extent_insert_hook *, u64 *, int flags);
+int bch2_btree_update(struct bch_fs *, enum btree_id,
+                      struct bkey_i *, u64 *);
+
+int bch2_btree_delete_range(struct bch_fs *, enum btree_id,
+                            struct bpos, struct bpos, struct bversion,
+                            struct disk_reservation *,
+                            struct extent_insert_hook *, u64 *);
+
+int bch2_btree_node_rewrite(struct btree_iter *, struct btree *,
+                            struct closure *);
+
+#endif /* _BCACHE_BTREE_INSERT_H */
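
A note on the shifts in write_block() and bch_btree_keys_u64s_remaining() above: they are sector/u64 unit conversions rather than magic numbers. The sketch below is a worked example assuming 512-byte sectors; the 256k node size used in it is illustrative only, not taken from this patch.

/*
 * Worked example of the unit conversions (assumes 512-byte sectors;
 * the 256k node size is illustrative only):
 *
 *   b->written << 9             sectors -> bytes (1 sector == 1 << 9 bytes),
 *                               so write_block() points at the first byte of
 *                               the node not yet written to disk
 *   c->sb.btree_node_size << 6  sectors -> u64s, since one sector holds
 *                               512 / sizeof(u64) == 64 == 1 << 6 u64s
 *
 * With a 256k node, btree_node_size == 512 sectors, so "total" in
 * bch_btree_keys_u64s_remaining() is 512 << 6 == 32768 u64s (256KiB / 8).
 * A bset counts as "written" iff it starts below write_block(b), which is
 * why bset_written() and bset_unwritten() compare pointers against it.
 */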
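COUNT_ARGS() counts its variadic arguments (up to nine) by having _TENTH_ARG() always select its tenth argument: padding the argument list with 9..1 shifts the correct count into the tenth slot. A minimal standalone demonstration, compilable with any C11 compiler outside the kernel tree:

#define _TENTH_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, N, ...)   N
#define COUNT_ARGS(...)  _TENTH_ARG(__VA_ARGS__, 9, 8, 7, 6, 5, 4, 3, 2, 1)

/*
 * COUNT_ARGS(a, b, c) expands to _TENTH_ARG(a, b, c, 9, 8, 7, 6, 5, 4, 3, 2, 1),
 * whose tenth argument is 3:
 */
_Static_assert(COUNT_ARGS(a) == 1, "one argument");
_Static_assert(COUNT_ARGS(a, b) == 2, "two arguments");
_Static_assert(COUNT_ARGS(a, b, c) == 3, "three arguments");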
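Finally, a sketch of how a caller might use the bch2_btree_insert_at() wrapper together with BTREE_INSERT_ATOMIC. This is a hypothetical example, not code from the patch: it assumes @iter and @k were set up elsewhere, and it passes NULL for the disk reservation, insert hook, and journal sequence out-parameter purely for brevity (a real caller may need a reservation). Per the doc comment above, -EINTR is only returned when BTREE_INSERT_ATOMIC is set and means the locking changed, so the caller simply retries:

/* Hypothetical caller: insert @k at @iter's position, retrying on -EINTR. */
static int example_insert_one(struct bch_fs *c, struct btree_iter *iter,
                              struct bkey_i *k)
{
        int ret;

        do {
                ret = bch2_btree_insert_at(c, NULL, NULL, NULL,
                                           BTREE_INSERT_ATOMIC,
                                           BTREE_INSERT_ENTRY(iter, k));
        } while (ret == -EINTR);

        return ret;
}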