summaryrefslogtreecommitdiff
path: root/Extents.mdwn
blob: a11f1afec68a1535ab080bfb129e0dfb8e594792 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167

Documentation for design of new extent format (in dev branch):

        /*
         * In extent bkeys, the value is a list of pointers (bch_extent_ptr), optionally
         * preceded by checksum/compression information (bch_extent_crc32 or
         * bch_extent_crc64).
         *
         * One major determining factor in the format of extents is how we handle and
         * represent extents that have been partially overwritten and thus trimmed:
         *
         * If an extent is not checksummed or compressed, when the extent is trimmed we
         * don't have to remember the extent we originally allocated and wrote: we can
         * merely adjust ptr->offset to point to the start of the data that
         * is currently live. The size field in struct bkey records the current (live)
         * size of the extent, and is also used to mean "size of region on disk that we
         * point to" in this case.
         *
         * Thus an extent that is not checksummed or compressed will consist only of a
         * list of bch_extent_ptrs, with none of the fields in
         * bch_extent_crc32/bch_extent_crc64.
         *
         * When an extent is checksummed or compressed, it's not possible to read only
         * the data that is currently live: we have to read the entire extent that was
         * originally written, and then return only the part of the extent that is
         * currently live.
         *
         * Thus, in addition to the current size of the extent in struct bkey, we need
         * to store the size of the originally allocated space - this is the
         * compressed_size and uncompressed_size fields in bch_extent_crc32/64. Also,
         * when the extent is trimmed, instead of modifying the offset field of the
         * pointer, we keep a second smaller offset field - "offset into the original
         * extent of the currently live region".
         *
         * The other major determining factor is replication and data migration:
         *
         * Each pointer may have its own bch_extent_crc32/64. When doing a replicated
         * write, we will initially write all the replicas in the same format, with the
         * same checksum type and compression format - however, when copygc runs later (or
         * tiering/cache promotion, anything that moves data), it is not in general
         * going to rewrite all the pointers at once - one of the replicas may be in a
         * bucket on one device that has very little fragmentation while another lives
         * in a bucket that has become heavily fragmented, and thus is being rewritten
         * sooner than the rest.
         *
         * Thus it will only move a subset of the pointers (or in the case of
         * tiering/cache promotion perhaps add a single pointer without dropping any
         * current pointers), and if the extent has been partially overwritten it must
         * write only the currently live portion (or copygc would not be able to reduce
         * fragmentation!) - which necessitates a different bch_extent_crc format for
         * the new pointer.
         *
         * But in the interests of space efficiency, we don't want to store one
         * bch_extent_crc for each pointer if we don't have to.
         *
         * Thus, a bch_extent consists of bch_extent_crc32s, bch_extent_crc64s, and
         * bch_extent_ptrs appended arbitrarily one after the other. We determine the
         * type of a given entry with a scheme similar to utf8, encoding the type in the
         * position of the first set bit:
         *
         * bch_extent_crc32        - field_type 1   (binary; 1 bit wide)
         * bch_extent_ptr          - field_type 10  (binary; 2 bits wide)
         * bch_extent_crc64        - field_type 100 (binary; 3 bits wide)
         *
         * We do it this way because bch_extent_crc32 is _very_ constrained on bits (and
         * bch_extent_crc64 is the least constrained).
         *
         * Then, each bch_extent_crc32/64 applies to the pointers that follow after it,
         * until the next bch_extent_crc32/64.
         *
         * If there are no bch_extent_crcs preceding a bch_extent_ptr, then that pointer
         * is neither checksummed nor compressed.
         */

        /*
         * Checksum/compression entry in its compact form: one 32-bit word of
         * bitfields (the widths below add up to 32) followed by a 32-bit csum.
         *
         * field_type        - entry type tag; binary 1 identifies a bch_extent_crc32
         * compressed_size,
         * uncompressed_size - size of the extent as it was originally written
         * offset            - offset into the original extent of the currently
         *                     live region
         * csum_type,
         * compression_type  - checksum type and compression format of the extent
         * csum              - the checksum value
         *
         * The big-endian branch declares the same fields in reverse order;
         * presumably so that each field lands on the same bits for either bitfield
         * endianness - confirm against the on-disk format.
         *
         * NOTE(review): there is no #else branch, so if neither
         * __LITTLE_ENDIAN_BITFIELD nor __BIG_ENDIAN_BITFIELD is defined the
         * bitfields are silently dropped and only csum remains.
         */
        struct bch_extent_crc32 {
        #if defined(__LITTLE_ENDIAN_BITFIELD)
                __u32                   field_type:1,
                                        compressed_size:8,
                                        uncompressed_size:8,
                                        offset:7,
                                        csum_type:4,
                                        compression_type:4;
        #elif defined (__BIG_ENDIAN_BITFIELD)
                __u32                   csum_type:4,
                                        compression_type:4,
                                        offset:7,
                                        uncompressed_size:8,
                                        compressed_size:8,
                                        field_type:1;
        #endif
                __u32                   csum;
        };

        /*
         * 1 << 7 == 128, matching the 7-bit width of bch_extent_crc32.offset.
         * Presumably the largest extent size this entry format is used for - TODO
         * confirm the unit and whether the bound is inclusive.
         */
        #define CRC32_EXTENT_SIZE_MAX   (1U << 7)

        /*
         * Checksum/compression entry in its wide form: one 64-bit word of
         * bitfields (the widths below add up to 64) followed by a 64-bit csum.
         *
         * field_type        - entry type tag; binary 100 identifies a
         *                     bch_extent_crc64
         * compressed_size,
         * uncompressed_size - size of the extent as it was originally written
         * offset            - offset into the original extent of the currently
         *                     live region
         * csum_type,
         * compression_type  - checksum type and compression format of the extent
         * csum              - the checksum value
         *
         * The big-endian branch declares the same fields in reverse order;
         * presumably so that each field lands on the same bits for either bitfield
         * endianness - confirm against the on-disk format.
         *
         * NOTE(review): there is no #else branch, so if neither
         * __LITTLE_ENDIAN_BITFIELD nor __BIG_ENDIAN_BITFIELD is defined the
         * bitfields are silently dropped and only csum remains.
         */
        struct bch_extent_crc64 {
        #if defined(__LITTLE_ENDIAN_BITFIELD)
                __u64                   field_type:3,
                                        compressed_size:18,
                                        uncompressed_size:18,
                                        offset:17,
                                        csum_type:4,
                                        compression_type:4;
        #elif defined (__BIG_ENDIAN_BITFIELD)
                __u64                   csum_type:4,
                                        compression_type:4,
                                        offset:17,
                                        uncompressed_size:18,
                                        compressed_size:18,
                                        field_type:3;
        #endif
                __u64                   csum;
        };

        /*
         * 1 << 17 == 131072, matching the 17-bit width of bch_extent_crc64.offset.
         * Presumably the largest extent size this entry format is used for - TODO
         * confirm the unit and whether the bound is inclusive.
         */
        #define CRC64_EXTENT_SIZE_MAX   (1U << 17)

        /*
         * Pointer entry: a single 64-bit word (the bitfield widths add up to 64)
         * that can also be read or written whole through _val.
         *
         * field_type    - entry type tag; binary 10 identifies a bch_extent_ptr
         * erasure_coded - single-bit flag (its semantics are not visible here)
         * offset        - where the data starts on the device; 45 bits wide
         * dev           - device number; 8 bits wide (PTR_LOST_DEV is a dummy value)
         * gen           - 8 bits wide; presumably a generation number - confirm
         *
         * The big-endian branch declares the same fields in reverse order;
         * presumably so that each field lands on the same bits for either bitfield
         * endianness - confirm against the on-disk format.
         *
         * NOTE(review): there is no #else branch, so if neither
         * __LITTLE_ENDIAN_BITFIELD nor __BIG_ENDIAN_BITFIELD is defined the inner
         * struct is empty and only _val remains.
         */
        struct bch_extent_ptr {
                union {
                struct {
        #if defined(__LITTLE_ENDIAN_BITFIELD)
                        __u64           field_type:2,
                                        erasure_coded:1,
                                        offset:45, /* 16 petabytes */
                                        dev:8,
                                        gen:8;
        #elif defined (__BIG_ENDIAN_BITFIELD)
                        __u64           gen:8,
                                        dev:8,
                                        offset:45,
                                        erasure_coded:1,
                                        field_type:2;
        #endif
                };

                /* The same 64 bits viewed as one integer. */
                __u64                   _val;
                };
        };

        /*
         * Build a bch_extent_ptr from a generation, an offset and a device number.
         *
         * Each argument is stored into the bitfield of the same name, so values
         * wider than the field (8 bits for gen and dev, 45 bits for offset) are
         * truncated. All remaining bits - field_type and erasure_coded - are zero.
         *
         * NOTE(review): the design comment for this format gives pointer entries
         * field_type binary 10, but this constructor leaves field_type at 0;
         * presumably callers set the tag when appending the pointer - confirm.
         */
        static inline struct bch_extent_ptr PTR(__u64 gen, __u64 offset, __u64 dev)
        {
                /* Start from an all-zero word so every unset field reads as 0. */
                struct bch_extent_ptr ptr = { ._val = 0 };

                ptr.offset = offset;
                ptr.dev    = dev;
                ptr.gen    = gen;

                return ptr;
        }

        /*
         * Dummy DEV numbers: placeholder values for bch_extent_ptr.dev.
         *
         * 255 is the largest value the 8-bit dev field can hold. Judging by the
         * name it marks a pointer whose device has been lost - confirm with callers.
         * The XXX note below flags this constant for removal.
         */

        #define PTR_LOST_DEV              255 /* XXX: kill */

        /*
         * Key type numbers for extents. Both types share one value layout; they
         * differ only in the "cached" distinction described below.
         */
        enum {
                BCH_EXTENT                = 128,

                /*
                 * This is kind of a hack: the key type is overloaded to carry a boolean
                 * that really should be part of the value. BCH_EXTENT and
                 * BCH_EXTENT_CACHED therefore have the same value type.
                 */
                BCH_EXTENT_CACHED         = 129,
        };

        /*
         * Value of an extent key.
         *
         * v is the struct bch_val header (defined elsewhere). ptr[0] is a
         * zero-length array marking where the variable-length list of entries
         * begins; it adds no storage of its own.
         *
         * NOTE(review): although the array is typed as bch_extent_ptr, the design
         * comment for this format says the list may also hold bch_extent_crc32 and
         * bch_extent_crc64 entries, told apart by field_type - confirm how callers
         * walk it.
         */
        struct bch_extent {
                struct bch_val                v;
                struct bch_extent_ptr        ptr[0];
        };
        /*
         * BKEY_VAL_TYPE is defined elsewhere; presumably it ties this value struct
         * to the BCH_EXTENT key type - confirm against the macro definition.
         */
        BKEY_VAL_TYPE(extent,                BCH_EXTENT);