author	Kent Overstreet <kent.overstreet@gmail.com>	2015-06-02 05:58:37 -0700
committer	Kent Overstreet <kent.overstreet@gmail.com>	2015-06-02 05:58:37 -0700
commit	b9eabb7e6c019f4dcda5733035e39003ab84f984 (patch)
tree	5a035e4c6ab80b6a0705947895ffdd7fc49e7344
parent	8f4396af2880d8759f54374cf12d8fb0fe029754 (diff)
formatting
-rw-r--r--	BcacheGuide.mdwn	177
1 file changed, 89 insertions, 88 deletions
diff --git a/BcacheGuide.mdwn b/BcacheGuide.mdwn
index 0036670..f626a86 100644
--- a/BcacheGuide.mdwn
+++ b/BcacheGuide.mdwn
@@ -33,22 +33,22 @@ container that holds a key and a value (`struct bkey`).
Here are their definitions:
- struct bpos {
- u64 inode;
- u64 offset;
- u32 snapshot;
- };
-
- struct bkey {
- u8 u64s;
- u8 format;
- u8 type;
-
- u8 pad;
- u32 version;
- u32 size;
- struct bpos p;
- };
+ struct bpos {
+ u64 inode;
+ u64 offset;
+ u32 snapshot;
+ };
+
+ struct bkey {
+ u8 u64s;
+ u8 format;
+ u8 type;
+
+ u8 pad;
+ u32 version;
+ u32 size;
+ struct bpos p;
+ };
The high bits of the search key are the inode field and the low bits are the
snapshot field (note that on little endian the order in the struct will actually
@@ -88,6 +88,7 @@ btree code will unpack the key and return a wrapper type that contains pointers
to the unpacked key and the value.
The scheme for the wrapper types is:
+
bkey_i key with inline value
bkey_s key with split value (wrapper with pointers to key and value)
bkey_s_c constant key with split value
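For illustration, the split wrappers are essentially just a pair of pointers - roughly this shape (a sketch, not necessarily the exact definition in the tree):

    struct bkey_s_c {
    	const struct bkey	*k;
    	const struct bch_val	*v;
    };

The typed wrappers (e.g. `bkey_s_c_xattr` below) look the same, except that `v` points at the concrete value type.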
@@ -98,13 +99,13 @@ of the correct type.
For example, here's the struct definition for extended attributes:
- struct bch_xattr {
- struct bch_val v;
- u8 x_type;
- u8 x_name_len;
- u16 x_val_len;
- u8 x_name[];
- };
+ struct bch_xattr {
+ struct bch_val v;
+ u8 x_type;
+ u8 x_name_len;
+ u16 x_val_len;
+ u8 x_name[];
+ };
The wrapper types will be `bkey_i_xattr`, `bkey_s_xattr`, and `bkey_s_c_xattr`.
When the xattr code is doing a lookup in the btree, the btree iterator will
@@ -152,16 +153,16 @@ Some important properties/invariants:
Lookups/insertions are done via the btree iterator, defined in btree.h:
- struct btree_iter;
+ struct btree_iter;
- void bch_btree_iter_init(struct btree_iter *,
- struct cache_set *,
- enum btree_id,
- struct bpos);
- int bch_btree_iter_unlock(struct btree_iter *);
+ void bch_btree_iter_init(struct btree_iter *,
+ struct cache_set *,
+ enum btree_id,
+ struct bpos);
+ int bch_btree_iter_unlock(struct btree_iter *);
- struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
- void bch_btree_iter_advance_pos(struct btree_iter *);
+ struct bkey_s_c bch_btree_iter_peek(struct btree_iter *);
+ void bch_btree_iter_advance_pos(struct btree_iter *);
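Putting those primitives together, iterating from a given position looks roughly like this (a sketch, error handling omitted):

    struct btree_iter iter;
    struct bkey_s_c k;

    bch_btree_iter_init(&iter, c, BTREE_ID_EXAMPLE, POS_MIN);

    /* peek returns a key with a NULL .k pointer once there's nothing left */
    while ((k = bch_btree_iter_peek(&iter)).k) {
    	/* ... use k ... */
    	bch_btree_iter_advance_pos(&iter);
    }

    bch_btree_iter_unlock(&iter);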
- A "`cache_set`" corresponds to a single filesystem/volume - the name coming
from bcache's starting point as a block cache, and turned into a `cache_set`
@@ -184,32 +185,32 @@ after the last key returned.
Lookup isn't provided as a primitive because most usage is geared more towards
iterating from a given position, but we now have enough to implement it:
- int lookup(struct bpos search)
- {
- struct btree_iter iter;
- struct bkey_s_c k;
- int ret = 0;
+ int lookup(struct bpos search)
+ {
+ struct btree_iter iter;
+ struct bkey_s_c k;
+ int ret = 0;
- bch_btree_iter_init(&iter, c, BTREE_ID_EXAMPLE, search);
+ bch_btree_iter_init(&iter, c, BTREE_ID_EXAMPLE, search);
- k = bch_btree_iter_peek(&iter);
- if (!k.k || bkey_cmp(k.k->p, search))
- ret = -ENOENT;
-
- bch_btree_iter_unlock(&iter);
- return ret;
- }
+ k = bch_btree_iter_peek(&iter);
+ if (!k.k || bkey_cmp(k.k->p, search))
+ ret = -ENOENT;
+
+ bch_btree_iter_unlock(&iter);
+ return ret;
+ }
If we want to iterate over every key in the btree (or just from a given point),
there's the convenience macro `for_each_btree_key()`:
- struct btree_iter iter;
- struct bkey_s_c k;
+ struct btree_iter iter;
+ struct bkey_s_c k;
- for_each_btree_key(&iter, c, BTREE_ID_EXAMPLE, POS_MIN, k)
- printk("got %llu:%llu\n", k.k->p.inode, k.k->p.offset);
+ for_each_btree_key(&iter, c, BTREE_ID_EXAMPLE, POS_MIN, k)
+ printk("got %llu:%llu\n", k.k->p.inode, k.k->p.offset);
- bch_btree_iter_unlock(&iter);
+ bch_btree_iter_unlock(&iter);
Insertion is most often done with `bch_btree_insert_at()`, which takes an
iterator and inserts at the iterator's current position. This is often used in
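As a rough sketch of that insert-at-iterator pattern (the argument list of `bch_btree_insert_at()` is simplified/assumed here, not its real signature):

    int insert_one(struct cache_set *c, struct bkey_i *insert)
    {
    	struct btree_iter iter;
    	int ret;

    	/* position the iterator at the key we want to insert */
    	bch_btree_iter_init(&iter, c, BTREE_ID_EXAMPLE, insert->k.p);

    	/* simplified arguments: insert at the iterator's current position */
    	ret = bch_btree_insert_at(&iter, insert);

    	bch_btree_iter_unlock(&iter);
    	return ret;
    }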
@@ -257,17 +258,17 @@ Here's a simplified version (in the latest version the extent format has gotten
considerably more complicated in order to incorporate checksums and compression
information, but the concept is still the same):
- struct bch_extent_ptr {
- __u64 offset:48,
- dev:8,
- gen:8;
- };
+ struct bch_extent_ptr {
+ __u64 offset:48,
+ dev:8,
+ gen:8;
+ };
- struct bch_extent {
- struct bch_val v;
+ struct bch_extent {
+ struct bch_val v;
- struct bch_extent_ptr ptr[0];
- };
+ struct bch_extent_ptr ptr[0];
+ };
- The device field is the index of one of the devices in the cache set (i.e.
volume/filesystem).
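For illustration, pulling the fields back out of one of these (simplified) pointers is just bitfield access - a sketch:

    static void print_extent_ptr(const struct bch_extent_ptr *ptr)
    {
    	/*
    	 * dev indexes a device in the cache set, offset is the sector on
    	 * that device, and gen is compared against the bucket's generation
    	 * to detect pointers that have gone stale.
    	 */
    	printk("dev %u offset %llu gen %u\n",
    	       (unsigned) ptr->dev,
    	       (unsigned long long) ptr->offset,
    	       (unsigned) ptr->gen);
    }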
@@ -333,14 +334,14 @@ This has a number of implications:
By this point, the IO path should be fairly straightforward.
-The core read path starts in bch_read(), in io.c. The algorithm goes like:
+The core read path starts in `bch_read()`, in io.c. The algorithm goes like:
- - Iterate over the extents btree (with for_each_btree_key_with_holes()),
+ - Iterate over the extents btree (with `for_each_btree_key_with_holes()`),
starting from the first sector in the request.
- Check the type of the returned key:
- - Hole? (KEY_TYPE_DELETED) - just return zeroes
+ - Hole? (`KEY_TYPE_DELETED`) - just return zeroes
- Extent? Check for a pointer we can read from.
@@ -350,8 +351,8 @@ The core read path starts in bch_read(), in io.c. The algorithm goes like:
- If the relevant device is missing/not online, return an error.
- Ok, we have a pointer we can read from. If the extent is smaller than
- the request, split the request (with bio_split()), and issue the request
- to the appropriate device:sector.
+ the request, split the request (with `bio_split()`), and issue the
+ request to the appropriate device:sector.
- Iterate until entire request has been handled.
@@ -361,7 +362,7 @@ basic algorithm is fairly simple there too:
- Set up a key that'll represent the data we're going to write (not the
pointers yet).
- - Allocate some space to write to (with bch_alloc_sectors()); add the
+ - Allocate some space to write to (with `bch_alloc_sectors()`); add the
pointer(s) to the space we allocated to the key we set up previously.
- Were we able to allocate as much space as we wanted? If not, split both the
@@ -411,15 +412,15 @@ make sure the dead parts of the extent aren't overwritten.
So, for compressed or checksummed extents here's the additional information
we'll need to store in struct bch_extent:
- struct bch_extent_crc32 {
- __u32 type:1,
- offset:7,
- compressed_size:8,
- uncompressed_size:8,
- csum_type:4,
- compression_type:4;
- __u32 csum;
- };
+ struct bch_extent_crc32 {
+ __u32 type:1,
+ offset:7,
+ compressed_size:8,
+ uncompressed_size:8,
+ csum_type:4,
+ compression_type:4;
+ __u32 csum;
+ };
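Roughly, this is what the read side has to do with those fields (a sketch of the steps, not the actual io.c code):

    /*
     * Reading from a checksummed/compressed extent (sketch):
     *
     *  - read compressed_size sectors starting at the pointer's
     *    device:offset - the data exactly as it was originally written
     *  - verify csum (whose kind is csum_type) over that data
     *  - decompress (compression_type) into uncompressed_size sectors
     *  - the currently live region starts 'offset' sectors into the
     *    decompressed data and is k->size sectors long
     */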
Now, when trimming an extent, instead of modifying the pointer offset to point
to the start of the new live region (as we still do for non
@@ -527,7 +528,7 @@ where the bkeys are all contiguous in memory and in sorted order:
};
Since bkeys are variable length, it's not possible to access keys randomly
-without other data structures - only iterate sequentially via bkey_next().
+without other data structures - only iterate sequentially via `bkey_next()`.
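Since a bkey's `u64s` field is the combined size of key and value in u64s, stepping to the next key is just pointer arithmetic - something like this sketch:

    /* sketch: keys are packed back to back, so the next key starts
     * k->u64s u64s past the start of this one */
    #define bkey_next(k)	((struct bkey *) ((__u64 *) (k) + (k)->u64s))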
A btree node thus contains multiple independent bsets that on lookup all must be
searched and iterated over. At any given time there will be zero or more bsets
@@ -537,7 +538,7 @@ inserted into.
As the btree is modified we must maintain the invariant in memory that there are
no duplicates (keys that compare as equal), excluding keys marked as deleted.
When an insertion overwrites an existing key, we will mark the existing key as
-deleted (by setting k->type = KEY_TYPE_DELETED) - but until the entire node is
+deleted (by setting `k->type = KEY_TYPE_DELETED`) - but until the entire node is
rewritten the old key will still exist on disk. To handle this, when a btree
node is read, the first thing we do is a mergesort of all the bsets it contains,
and as part of the mergesort duplicate keys are found and the older bkeys are
@@ -601,14 +602,14 @@ out, and now needs to take a write lock on the parent in order to add the
pointers to the new nodes (after which it will free the old child).
Thread A just wants to insert into the child node - it has a read lock on the
-parent node and it's looked up the child node, and now it's waiting (on thread
-B) to get an intent lock on the child.
+parent node and it's looked up the child node, and now it's waiting on thread
+B in order to get an intent lock on the child.
But thread A has blocked thread B from taking its write lock in order to update
the parent node, and thread B can't drop its intent lock on the child until
after the new nodes are visible and it has freed the child node.
-The way this deadlock is handled is by enforcing (in bch_btree_node_get()) that
+The way this deadlock is handled is by enforcing (in `bch_btree_node_get()`) that
we drop read locks on parent nodes _before_ taking intent locks on child nodes -
this might cause us to race and have the btree node freed out from under us before
we lock it, so we check for that after grabbing the intent lock and redo the
@@ -617,10 +618,10 @@ traversal if necessary.
One other thing worth mentioning is bcache's btree node locks have embedded
sequence numbers, which are incremented when taking and releasing write locks
(much like seqlocks). This allows us to aggressively drop locks (because we'll
-usually be able to retake the lock), and we also use it for a try_upgrade() - if
-we discover we need an intent lock (e.g. for a split, or because the caller is
-inserting into a leaf node they didn't get an intent lock for) we'll usually be
-able to get it without having to unwind and redo the traversal.
+usually be able to retake the lock), and we also use it for a `try_upgrade()` -
+if we discover we need an intent lock (e.g. for a split, or because the caller
+is inserting into a leaf node they didn't get an intent lock for) we'll usually
+be able to get it without having to unwind and redo the traversal.
### Journaling
@@ -632,15 +633,15 @@ asynchronously.
The journal is a purely logical log, a list of insertions - bkeys - to reinsert
on recovery in the same order they're present in the journal. Provided every
index update is journaled (a critical invariant here), redoing those insertions
-is an idempotent operation. See bch_journal_replay_key() in journal.c - the
+is an idempotent operation. See `bch_journal_replay_key()` in journal.c - the
journal uses the same insert path as everything else and doesn't know anything
about the structure of the btree.
It's critical that keys appear in the journal in the same order as the
insertions happened, so both are done under the btree node's write lock: see
-bch_btree_insert_and_journal() in btree.c, which calls bch_bset_insert() at the
-top (inserting into the btree node in memory) and bch_journal_add_keys() to
-journal the same key at the bottom.
+`bch_btree_insert_and_journal()` in btree.c, which calls `bch_bset_insert()` at
+the top (inserting into the btree node in memory) and `bch_journal_add_keys()`
+to journal the same key at the bottom.
At this point in the btree insertion path, we've already marked any key(s) that
we overlapped with (possibly more than one for extents) as deleted - i.e. the
@@ -649,7 +650,7 @@ write lock.
So we also have some machinery for journal reservations: we reserve some amount
of space in the journal (in units of u64s, as always) midway through the
-insertion path (in bch_btree_insert_keys()). The journal reservation machinery
+insertion path (in `bch_btree_insert_keys()`). The journal reservation machinery
(as well as adding the keys to the journal) is entirely lockless in the
fastpath.
@@ -730,7 +731,7 @@ Then there are a couple tricks we use to make these nodes as small as possible:
can compute, given a node's position in the array/tree, its index in an inorder
traversal, we only have to store the key's offset within that cacheline.
- This is done by to_inorder() in bset.c, and it's mostly just shifts and bit
+ This is done by `to_inorder()` in bset.c, and it's mostly just shifts and bit
operations.
- Observe that as we're doing the lookup and walking down the tree, we have