summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Kent Overstreet <kent.overstreet@gmail.com> 2015-05-25 18:48:33 -0700
committer: Kent Overstreet <kent.overstreet@gmail.com> 2015-05-25 18:48:48 -0700
commit: 2b2fa5bcbe7e1413f62657f141fe112089609bab (patch)
tree: 790eba539c045887053ab6214100ce59851e43a2
parent: 8ab8c6c64b9cf56ab35f7f8c32c8b84c5ce7ba83 (diff)
extents design
-rw-r--r-- Extents.mdwn 168
1 files changed, 168 insertions, 0 deletions
diff --git a/Extents.mdwn b/Extents.mdwn
new file mode 100644
index 0000000..c6ec292
--- /dev/null
+++ b/Extents.mdwn
@@ -0,0 +1,168 @@
+
+Documentation for design of new extent format (in dev branch):
+
+ /*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that
+ * is currently live. The size field in struct bkey records the current (live)
+ * size of the extent, and is also used to mean "size of region on disk that we
+ * point to" in this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8, encoding the type in the
+ * position of the first set bit (field_type values below are in binary):
+ *
+ * bch_extent_crc32 - field_type 1
+ * bch_extent_ptr - field_type 10
+ * bch_extent_crc64 - field_type 100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+ struct bch_extent_crc32 { /* compact checksum/compression entry: one 32-bit bitfield word plus a 32-bit csum */
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 field_type:1, /* entry type tag; the design notes assign crc32 the pattern "1" */
+ compressed_size:8, /* with uncompressed_size: size of the extent as originally written */
+ uncompressed_size:8,
+ offset:7, /* offset into the original extent of the currently live region */
+ csum_type:4,
+ compression_type:4;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ __u32 csum_type:4, /* same fields, declared in reverse order for big-endian bitfields */
+ compression_type:4,
+ offset:7,
+ uncompressed_size:8,
+ compressed_size:8,
+ field_type:1;
+ #endif
+ __u32 csum; /* checksum value; presumably interpreted according to csum_type - confirm */
+ };
+
+ #define CRC32_EXTENT_SIZE_MAX (1U << 7) /* NOTE(review): equals the range of the 7-bit offset field; the size fields are 8 bits - confirm intended bound */
+
+ struct bch_extent_crc64 { /* wide checksum/compression entry: one 64-bit bitfield word plus a 64-bit csum */
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 field_type:3, /* entry type tag; the design notes assign crc64 the pattern "100" */
+ compressed_size:18, /* with uncompressed_size: size of the extent as originally written */
+ uncompressed_size:18,
+ offset:17, /* offset into the original extent of the currently live region */
+ csum_type:4,
+ compression_type:4;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 csum_type:4, /* same fields, declared in reverse order for big-endian bitfields */
+ compression_type:4,
+ offset:17,
+ uncompressed_size:18,
+ compressed_size:18,
+ field_type:3;
+ #endif
+ __u64 csum; /* checksum value; presumably interpreted according to csum_type - confirm */
+ };
+
+ #define CRC64_EXTENT_SIZE_MAX (1U << 17) /* NOTE(review): equals the range of the 17-bit offset field; the size fields are 18 bits - confirm intended bound */
+
+ struct bch_extent_ptr { /* one pointer entry: 64 bits of bitfields overlaid with a raw 64-bit word */
+ union {
+ struct {
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 field_type:2, /* entry type tag; the design notes assign pointers the pattern "10" */
+ erasure_coded:1,
+ offset:45, /* 16 petabytes */
+ dev:8,
+ gen:8;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ __u64 gen:8, /* same fields, declared in reverse order for big-endian bitfields */
+ dev:8,
+ offset:45,
+ erasure_coded:1,
+ field_type:2;
+ #endif
+ };
+
+ __u64 _val; /* the same 64 bits viewed as a single integer */
+ };
+ };
+
+ static inline struct bch_extent_ptr PTR(__u64 gen, __u64 offset, __u64 dev) /* build a pointer entry from gen, offset and dev */
+ {
+ return (struct bch_extent_ptr) { /* members not named here (field_type, erasure_coded) are zero-initialized */
+ .gen = gen,
+ .dev = dev,
+ .offset = offset,
+ }; /* NOTE(review): field_type stays 0 although the design notes give pointers the pattern "10" - confirm callers set it */
+ }
+
+ /* Dummy DEV numbers: */
+
+ #define PTR_LOST_DEV 255 /* XXX: kill; 255 is the largest value the 8-bit dev field can hold */
+
+ enum { /* presumably bkey type numbers for extent keys - confirm against the bkey definitions */
+ BCH_EXTENT = 128,
+
+ /*
+ * This is kind of a hack, we're overloading the type for a boolean that
+ * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
+ * have the same value type:
+ */
+ BCH_EXTENT_CACHED = 129,
+ BCH_CEXTENT = 130, /* NOTE(review): not described anywhere in this document - confirm meaning */
+ };
+
+ struct bch_extent { /* value of an extent bkey: a bch_val header followed by a variable number of entries */
+ struct bch_val v;
+ struct bch_extent_ptr ptr[0]; /* zero-length trailing array; per the design notes, crc32/crc64 entries may be interleaved with pointers */
+ };
+ BKEY_VAL_TYPE(extent, BCH_EXTENT); /* presumably ties this value struct to the BCH_EXTENT key type; macro is defined elsewhere */