summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Kent Overstreet <kent.overstreet@gmail.com> 2015-05-25 18:51:04 -0700
committer Kent Overstreet <kent.overstreet@gmail.com> 2015-05-25 18:51:04 -0700
commit ef13bd2f2717bc256023f342eaf0d78146219edb (patch)
tree c8b73e0f8858561cb043a83ac3f7b67d579b9141
parent 2b2fa5bcbe7e1413f62657f141fe112089609bab (diff)
reformat
-rw-r--r-- Extents.mdwn 330
1 files changed, 165 insertions, 165 deletions
diff --git a/Extents.mdwn b/Extents.mdwn
index c6ec292..b43985c 100644
--- a/Extents.mdwn
+++ b/Extents.mdwn
@@ -1,168 +1,168 @@
Documentation for design of new extent format (in dev branch):
- /*
- * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
- * preceded by checksum/compression information (bch_extent_crc32 or
- * bch_extent_crc64).
- *
- * One major determining factor in the format of extents is how we handle and
- * represent extents that have been partially overwritten and thus trimmed:
- *
- * If an extent is not checksummed or compressed, when the extent is trimmed we
- * don't have to remember the extent we originally allocated and wrote: we can
- * merely adjust ptr->offset to point to the start of the start of the data that
- * is currently live. The size field in struct bkey records the current (live)
- * size of the extent, and is also used to mean "size of region on disk that we
- * point to" in this case.
- *
- * Thus an extent that is not checksummed or compressed will consist only of a
- * list of bch_extent_ptrs, with none of the fields in
- * bch_extent_crc32/bch_extent_crc64.
- *
- * When an extent is checksummed or compressed, it's not possible to read only
- * the data that is currently live: we have to read the entire extent that was
- * originally written, and then return only the part of the extent that is
- * currently live.
- *
- * Thus, in addition to the current size of the extent in struct bkey, we need
- * to store the size of the originally allocated space - this is the
- * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
- * when the extent is trimmed, instead of modifying the offset field of the
- * pointer, we keep a second smaller offset field - "offset into the original
- * extent of the currently live region".
- *
- * The other major determining factor is replication and data migration:
- *
- * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
- * write, we will initially write all the replicas in the same format, with the
- * same checksum type and compression format - however, when copygc runs later (or
- * tiering/cache promotion, anything that moves data), it is not in general
- * going to rewrite all the pointers at once - one of the replicas may be in a
- * bucket on one device that has very little fragmentation while another lives
- * in a bucket that has become heavily fragmented, and thus is being rewritten
- * sooner than the rest.
- *
- * Thus it will only move a subset of the pointers (or in the case of
- * tiering/cache promotion perhaps add a single pointer without dropping any
- * current pointers), and if the extent has been partially overwritten it must
- * write only the currently live portion (or copygc would not be able to reduce
- * fragmentation!) - which necessitates a different bch_extent_crc format for
- * the new pointer.
- *
- * But in the interests of space efficiency, we don't want to store one
- * bch_extent_crc for each pointer if we don't have to.
- *
- * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
- * bch_extent_ptrs appended arbitrarily one after the other. We determine the
- * type of a given entry with a scheme similar to utf8, encoding the type in the
- * position of the first set bit:
- *
- * bch_extent_crc32 - field_type 1
- * bch_extent_ptr - field_type 10
- * bch_extent_crc64 - field_type 100
- *
- * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
- * bch_extent_crc64 is the least constrained).
- *
- * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
- * until the next bch_extent_crc32/64.
- *
- * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
- * is neither checksummed nor compressed.
- */
-
- struct bch_extent_crc32 {
- #if defined(__LITTLE_ENDIAN_BITFIELD)
- __u32 field_type:1,
- compressed_size:8,
- uncompressed_size:8,
- offset:7,
- csum_type:4,
- compression_type:4;
- #elif defined (__BIG_ENDIAN_BITFIELD)
- __u32 csum_type:4,
- compression_type:4,
- offset:7,
- uncompressed_size:8,
- compressed_size:8,
- field_type:1;
- #endif
- __u32 csum;
- };
-
- #define CRC32_EXTENT_SIZE_MAX (1U << 7)
-
- struct bch_extent_crc64 {
- #if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 field_type:3,
- compressed_size:18,
- uncompressed_size:18,
- offset:17,
- csum_type:4,
- compression_type:4;
- #elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 csum_type:4,
- compression_type:4,
- offset:17,
- uncompressed_size:18,
- compressed_size:18,
- field_type:3;
- #endif
- __u64 csum;
- };
-
- #define CRC64_EXTENT_SIZE_MAX (1U << 17)
-
- struct bch_extent_ptr {
- union {
- struct {
- #if defined(__LITTLE_ENDIAN_BITFIELD)
- __u64 field_type:2,
- erasure_coded:1,
- offset:45, /* 16 petabytes */
- dev:8,
- gen:8;
- #elif defined (__BIG_ENDIAN_BITFIELD)
- __u64 gen:8,
- dev:8,
- offset:45,
- erasure_coded:1,
- field_type:2;
- #endif
- };
-
- __u64 _val;
- };
- };
-
- static inline struct bch_extent_ptr PTR(__u64 gen, __u64 offset, __u64 dev)
- {
- return (struct bch_extent_ptr) {
- .gen = gen,
- .dev = dev,
- .offset = offset,
- };
- }
-
- /* Dummy DEV numbers: */
-
- #define PTR_LOST_DEV 255 /* XXX: kill */
-
- enum {
- BCH_EXTENT = 128,
-
- /*
- * This is kind of a hack, we're overloading the type for a boolean that
- * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
- * have the same value type:
- */
- BCH_EXTENT_CACHED = 129,
- BCH_CEXTENT = 130,
- };
-
- struct bch_extent {
- struct bch_val v;
- struct bch_extent_ptr ptr[0];
- };
- BKEY_VAL_TYPE(extent, BCH_EXTENT);
+ /*
+ * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
+ * preceded by checksum/compression information (bch_extent_crc32 or
+ * bch_extent_crc64).
+ *
+ * One major determining factor in the format of extents is how we handle and
+ * represent extents that have been partially overwritten and thus trimmed:
+ *
+ * If an extent is not checksummed or compressed, when the extent is trimmed we
+ * don't have to remember the extent we originally allocated and wrote: we can
+ * merely adjust ptr->offset to point to the start of the data that
+ * is currently live. The size field in struct bkey records the current (live)
+ * size of the extent, and is also used to mean "size of region on disk that we
+ * point to" in this case.
+ *
+ * Thus an extent that is not checksummed or compressed will consist only of a
+ * list of bch_extent_ptrs, with none of the fields in
+ * bch_extent_crc32/bch_extent_crc64.
+ *
+ * When an extent is checksummed or compressed, it's not possible to read only
+ * the data that is currently live: we have to read the entire extent that was
+ * originally written, and then return only the part of the extent that is
+ * currently live.
+ *
+ * Thus, in addition to the current size of the extent in struct bkey, we need
+ * to store the size of the originally allocated space - this is the
+ * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
+ * when the extent is trimmed, instead of modifying the offset field of the
+ * pointer, we keep a second smaller offset field - "offset into the original
+ * extent of the currently live region".
+ *
+ * The other major determining factor is replication and data migration:
+ *
+ * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
+ * write, we will initially write all the replicas in the same format, with the
+ * same checksum type and compression format - however, when copygc runs later (or
+ * tiering/cache promotion, anything that moves data), it is not in general
+ * going to rewrite all the pointers at once - one of the replicas may be in a
+ * bucket on one device that has very little fragmentation while another lives
+ * in a bucket that has become heavily fragmented, and thus is being rewritten
+ * sooner than the rest.
+ *
+ * Thus it will only move a subset of the pointers (or in the case of
+ * tiering/cache promotion perhaps add a single pointer without dropping any
+ * current pointers), and if the extent has been partially overwritten it must
+ * write only the currently live portion (or copygc would not be able to reduce
+ * fragmentation!) - which necessitates a different bch_extent_crc format for
+ * the new pointer.
+ *
+ * But in the interests of space efficiency, we don't want to store one
+ * bch_extent_crc for each pointer if we don't have to.
+ *
+ * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
+ * bch_extent_ptrs appended arbitrarily one after the other. We determine the
+ * type of a given entry with a scheme similar to utf8, encoding the type in the
+ * position of the first set bit:
+ *
+ * bch_extent_crc32 - field_type 1
+ * bch_extent_ptr - field_type 10
+ * bch_extent_crc64 - field_type 100
+ *
+ * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
+ * bch_extent_crc64 is the least constrained).
+ *
+ * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
+ * until the next bch_extent_crc32/64.
+ *
+ * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
+ * is neither checksummed nor compressed.
+ */
+
+ /*
+ * Compact checksum/compression entry of an extent value: one 32-bit word of
+ * bitfields followed by a 32-bit csum.
+ *
+ * Per the format notes above, this entry kind is tagged by field_type = 1 and
+ * applies to the pointers that follow it, up to the next crc entry.
+ *
+ * Both preprocessor branches declare the same six bitfields, in opposite
+ * order; the widths sum to 32 bits (1 + 8 + 8 + 7 + 4 + 4).
+ *
+ * NOTE(review): the units of compressed_size, uncompressed_size and offset are
+ * not stated in this excerpt - presumably sectors; confirm.
+ */
+ struct bch_extent_crc32 {
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u32 field_type:1, /* entry tag, 1 for this entry kind */
+ compressed_size:8, /* with uncompressed_size: size of the originally allocated space */
+ uncompressed_size:8,
+ offset:7, /* offset into the original extent of the currently live region */
+ csum_type:4,
+ compression_type:4;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ /* Same fields as the little-endian branch, declared in reverse order. */
+ __u32 csum_type:4,
+ compression_type:4,
+ offset:7,
+ uncompressed_size:8,
+ compressed_size:8,
+ field_type:1;
+ #endif
+ /* NOTE(review): presumably the checksum selected by csum_type - confirm. */
+ __u32 csum;
+ };
+
+ /*
+ * Evaluates to 128. The shift count equals the width of the offset bitfield
+ * in struct bch_extent_crc32 (7 bits).
+ *
+ * NOTE(review): compressed_size and uncompressed_size in that struct are 8
+ * bits wide, not 7 - confirm which field this limit is meant to track, and
+ * its units.
+ */
+ #define CRC32_EXTENT_SIZE_MAX (1U << 7)
+
+ /*
+ * Wide checksum/compression entry of an extent value: one 64-bit word of
+ * bitfields followed by a 64-bit csum.
+ *
+ * Per the format notes above, this entry kind is tagged by field_type = 100
+ * (binary) and is the least bit-constrained of the entry kinds. Like
+ * bch_extent_crc32, it applies to the pointers that follow it, up to the next
+ * crc entry.
+ *
+ * Both preprocessor branches declare the same six bitfields, in opposite
+ * order; the widths sum to 64 bits (3 + 18 + 18 + 17 + 4 + 4).
+ *
+ * NOTE(review): the units of compressed_size, uncompressed_size and offset are
+ * not stated in this excerpt - presumably sectors; confirm.
+ */
+ struct bch_extent_crc64 {
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 field_type:3, /* entry tag, binary 100 for this entry kind */
+ compressed_size:18, /* with uncompressed_size: size of the originally allocated space */
+ uncompressed_size:18,
+ offset:17, /* offset into the original extent of the currently live region */
+ csum_type:4,
+ compression_type:4;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ /* Same fields as the little-endian branch, declared in reverse order. */
+ __u64 csum_type:4,
+ compression_type:4,
+ offset:17,
+ uncompressed_size:18,
+ compressed_size:18,
+ field_type:3;
+ #endif
+ /* NOTE(review): presumably the checksum selected by csum_type - confirm. */
+ __u64 csum;
+ };
+
+ /*
+ * Evaluates to 131072. The shift count equals the width of the offset
+ * bitfield in struct bch_extent_crc64 (17 bits).
+ *
+ * NOTE(review): compressed_size and uncompressed_size in that struct are 18
+ * bits wide, not 17 - confirm which field this limit is meant to track, and
+ * its units.
+ */
+ #define CRC64_EXTENT_SIZE_MAX (1U << 17)
+
+ /*
+ * One pointer entry of an extent value, packed into 64 bits.
+ *
+ * Per the format notes above, pointer entries are tagged by field_type = 10
+ * (binary). The anonymous struct gives access to the individual bitfields;
+ * _val overlays the same storage as a single integer.
+ *
+ * Both preprocessor branches declare the same five bitfields, in opposite
+ * order; the widths sum to 64 bits (2 + 1 + 45 + 8 + 8).
+ *
+ * NOTE(review): dev is presumably a device index and gen a generation number;
+ * neither is defined in this excerpt - confirm.
+ */
+ struct bch_extent_ptr {
+ union {
+ struct {
+ #if defined(__LITTLE_ENDIAN_BITFIELD)
+ __u64 field_type:2, /* entry tag, binary 10 for this entry kind */
+ erasure_coded:1, /* single-bit flag */
+ offset:45, /* 16 petabytes */
+ dev:8,
+ gen:8;
+ #elif defined (__BIG_ENDIAN_BITFIELD)
+ /* Same fields as the little-endian branch, declared in reverse order. */
+ __u64 gen:8,
+ dev:8,
+ offset:45,
+ erasure_coded:1,
+ field_type:2;
+ #endif
+ };
+
+ /* The whole entry viewed as one integer. */
+ __u64 _val;
+ };
+ };
+
+ /*
+ * Build a struct bch_extent_ptr by value from a generation, an offset and a
+ * device number.
+ *
+ * Note the parameter order is (gen, offset, dev), which differs from the
+ * order of the initializers below.
+ *
+ * Only gen, dev and offset are named in the initializer, so the remaining
+ * bitfields (field_type and erasure_coded) are zero-initialized. Arguments
+ * wider than their destination bitfield (8, 45 and 8 bits) are reduced modulo
+ * the field width on assignment.
+ *
+ * NOTE(review): the format notes above say pointer entries carry field_type
+ * 10 (binary), but this constructor leaves field_type at 0 - confirm that the
+ * tag is set elsewhere before the entry is stored.
+ */
+ static inline struct bch_extent_ptr PTR(__u64 gen, __u64 offset, __u64 dev)
+ {
+ return (struct bch_extent_ptr) {
+ .gen = gen,
+ .dev = dev,
+ .offset = offset,
+ };
+ }
+
+ /* Dummy DEV numbers: */
+
+ /*
+ * 255 is the largest value the 8-bit dev field of struct bch_extent_ptr can
+ * hold. NOTE(review): presumably marks a pointer whose device is gone -
+ * confirm; the existing XXX marks this constant for removal.
+ */
+ #define PTR_LOST_DEV 255 /* XXX: kill */
+
+ /*
+ * Type codes used for extents. BCH_EXTENT is the code passed to
+ * BKEY_VAL_TYPE() together with struct bch_extent.
+ *
+ * NOTE(review): the meaning of BCH_CEXTENT is not documented in this excerpt
+ * - confirm.
+ */
+ enum {
+ BCH_EXTENT = 128,
+
+ /*
+ * This is kind of a hack, we're overloading the type for a boolean that
+ * really should be part of the value - BCH_EXTENT and BCH_EXTENT_CACHED
+ * have the same value type:
+ */
+ BCH_EXTENT_CACHED = 129,
+ BCH_CEXTENT = 130,
+ };
+
+ /*
+ * Value of an extent key: a struct bch_val header followed by a
+ * variable-length run of entries.
+ *
+ * ptr[0] is a zero-length trailing array (a GNU C extension), so it adds
+ * nothing to sizeof(struct bch_extent) and only marks where the entries
+ * begin.
+ *
+ * NOTE(review): per the format notes above, the entries may be
+ * bch_extent_crc32 / bch_extent_crc64 as well as bch_extent_ptr, so indexing
+ * ptr[i] is presumably only meaningful after checking each entry's
+ * field_type - confirm against the iteration helpers.
+ */
+ struct bch_extent {
+ struct bch_val v;
+ struct bch_extent_ptr ptr[0];
+ };
+ /*
+ * BKEY_VAL_TYPE is defined outside this excerpt. NOTE(review): presumably it
+ * binds struct bch_extent to the BCH_EXTENT type code - confirm.
+ */
+ BKEY_VAL_TYPE(extent, BCH_EXTENT);