From f656fa4013388a52d1fdd82268955c5e7be63ad2 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 27 Jan 2025 14:15:33 -0800 Subject: dm-crypt: switch to using the crc32 library Now that the crc32() library function takes advantage of architecture-specific optimizations, it is unnecessary to go through the crypto API. Just use crc32(). This is much simpler, and it improves performance due to eliminating the crypto API overhead. (However, this only affects the TCW IV mode of dm-crypt, which is a compatibility mode that is rarely used compared to other dm-crypt modes.) Signed-off-by: Eric Biggers Signed-off-by: Mikulas Patocka --- drivers/md/Kconfig | 1 + drivers/md/dm-crypt.c | 41 ++++++++++------------------------------- 2 files changed, 11 insertions(+), 31 deletions(-) diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig index 0b1870a09e1f..06f809e70f15 100644 --- a/drivers/md/Kconfig +++ b/drivers/md/Kconfig @@ -267,6 +267,7 @@ config DM_CRYPT depends on BLK_DEV_DM depends on (ENCRYPTED_KEYS || ENCRYPTED_KEYS=n) depends on (TRUSTED_KEYS || TRUSTED_KEYS=n) + select CRC32 select CRYPTO select CRYPTO_CBC select CRYPTO_ESSIV diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 02a2919f4e5a..9dfdb63220d7 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -125,7 +126,6 @@ struct iv_lmk_private { #define TCW_WHITENING_SIZE 16 struct iv_tcw_private { - struct crypto_shash *crc32_tfm; u8 *iv_seed; u8 *whitening; }; @@ -607,10 +607,6 @@ static void crypt_iv_tcw_dtr(struct crypt_config *cc) tcw->iv_seed = NULL; kfree_sensitive(tcw->whitening); tcw->whitening = NULL; - - if (tcw->crc32_tfm && !IS_ERR(tcw->crc32_tfm)) - crypto_free_shash(tcw->crc32_tfm); - tcw->crc32_tfm = NULL; } static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, @@ -628,13 +624,6 @@ static int crypt_iv_tcw_ctr(struct crypt_config *cc, struct dm_target *ti, return -EINVAL; } - tcw->crc32_tfm = crypto_alloc_shash("crc32", 0, - CRYPTO_ALG_ALLOCATES_MEMORY); - if (IS_ERR(tcw->crc32_tfm)) { - ti->error = "Error initializing CRC32 in TCW"; - return PTR_ERR(tcw->crc32_tfm); - } - tcw->iv_seed = kzalloc(cc->iv_size, GFP_KERNEL); tcw->whitening = kzalloc(TCW_WHITENING_SIZE, GFP_KERNEL); if (!tcw->iv_seed || !tcw->whitening) { @@ -668,36 +657,28 @@ static int crypt_iv_tcw_wipe(struct crypt_config *cc) return 0; } -static int crypt_iv_tcw_whitening(struct crypt_config *cc, - struct dm_crypt_request *dmreq, - u8 *data) +static void crypt_iv_tcw_whitening(struct crypt_config *cc, + struct dm_crypt_request *dmreq, u8 *data) { struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 buf[TCW_WHITENING_SIZE]; - SHASH_DESC_ON_STACK(desc, tcw->crc32_tfm); - int i, r; + int i; /* xor whitening with sector number */ crypto_xor_cpy(buf, tcw->whitening, (u8 *)§or, 8); crypto_xor_cpy(&buf[8], tcw->whitening + 8, (u8 *)§or, 8); /* calculate crc32 for every 32bit part and xor it */ - desc->tfm = tcw->crc32_tfm; - for (i = 0; i < 4; i++) { - r = crypto_shash_digest(desc, &buf[i * 4], 4, &buf[i * 4]); - if (r) - goto out; - } + for (i = 0; i < 4; i++) + put_unaligned_le32(crc32(0, &buf[i * 4], 4), &buf[i * 4]); crypto_xor(&buf[0], &buf[12], 4); crypto_xor(&buf[4], &buf[8], 4); /* apply whitening (8 bytes) to whole sector */ for (i = 0; i < ((1 << SECTOR_SHIFT) / 8); i++) crypto_xor(data + i * 8, buf, 8); -out: memzero_explicit(buf, sizeof(buf)); - return r; } static int 
crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, @@ -707,13 +688,12 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, struct iv_tcw_private *tcw = &cc->iv_gen_private.tcw; __le64 sector = cpu_to_le64(dmreq->iv_sector); u8 *src; - int r = 0; /* Remove whitening from ciphertext */ if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) { sg = crypt_get_sg_data(cc, dmreq->sg_in); src = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, src + sg->offset); kunmap_local(src); } @@ -723,7 +703,7 @@ static int crypt_iv_tcw_gen(struct crypt_config *cc, u8 *iv, crypto_xor_cpy(&iv[8], tcw->iv_seed + 8, (u8 *)§or, cc->iv_size - 8); - return r; + return 0; } static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, @@ -731,7 +711,6 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, { struct scatterlist *sg; u8 *dst; - int r; if (bio_data_dir(dmreq->ctx->bio_in) != WRITE) return 0; @@ -739,10 +718,10 @@ static int crypt_iv_tcw_post(struct crypt_config *cc, u8 *iv, /* Apply whitening on ciphertext */ sg = crypt_get_sg_data(cc, dmreq->sg_out); dst = kmap_local_page(sg_page(sg)); - r = crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); + crypt_iv_tcw_whitening(cc, dmreq, dst + sg->offset); kunmap_local(dst); - return r; + return 0; } static int crypt_iv_random_gen(struct crypt_config *cc, u8 *iv, -- cgit v1.2.3 From 82596487635012460c19d4dd257d5d59147cbf27 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 29 Jan 2025 13:58:55 +0100 Subject: dm-verity: Document restart_on_error and panic_on_error options This patch adds documentation for options introduced in commit f811b83879fb ("dm-verity: introduce the options restart_on_error and panic_on_error"). Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/verity.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index a65c1602cb23..fb3a045d0c72 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -87,6 +87,15 @@ panic_on_corruption Panic the device when a corrupted block is discovered. This option is not compatible with ignore_corruption and restart_on_corruption. +restart_on_error + Restart the system when an I/O error is detected. + This option can be combined with the restart_on_corruption option. + +panic_on_error + Panic the device when an I/O error is detected. This option is + not compatible with the restart_on_error option but can be combined + with the panic_on_corruption option. + ignore_zero_blocks Do not verify blocks that are expected to contain zeroes and always return zeroes instead. This may be useful if the partition contains unused blocks -- cgit v1.2.3 From d8955df3837f7ae1b555a66e5153235f6919b7a5 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 29 Jan 2025 13:58:56 +0100 Subject: dm-integrity: Document Inline mode for storing integrity data This patch adds documentation for new 'I' mode for dm-integrity introduced in commit fb0987682c62 ("dm-integrity: introduce the Inline mode"). 
Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/dm-integrity.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-integrity.rst b/Documentation/admin-guide/device-mapper/dm-integrity.rst index d8a5f14d0e3c..c2e18ecc065c 100644 --- a/Documentation/admin-guide/device-mapper/dm-integrity.rst +++ b/Documentation/admin-guide/device-mapper/dm-integrity.rst @@ -92,6 +92,11 @@ Target arguments: allowed. This mode is useful for data recovery if the device cannot be activated in any of the other standard modes. + I - inline mode - in this mode, dm-integrity will store integrity + data directly in the underlying device sectors. + The underlying device must have an integrity profile that + allows storing user integrity data and provides enough + space for the selected integrity tag. 5. the number of additional arguments -- cgit v1.2.3 From 8892606045fda29be7adfc5fba21bb1dfe079b93 Mon Sep 17 00:00:00 2001 From: Milan Broz Date: Wed, 29 Jan 2025 13:58:57 +0100 Subject: dm-crypt: Document integrity_key_size option. This patch adds documentation for new option introduced in commit 4441686b24a1 ("dm-crypt: Allow to specify the integrity key size as option"). Signed-off-by: Milan Broz Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/dm-crypt.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/device-mapper/dm-crypt.rst b/Documentation/admin-guide/device-mapper/dm-crypt.rst index 9f8139ff97d6..4467f6d4b632 100644 --- a/Documentation/admin-guide/device-mapper/dm-crypt.rst +++ b/Documentation/admin-guide/device-mapper/dm-crypt.rst @@ -146,6 +146,11 @@ integrity:: integrity for the encrypted device. The additional space is then used for storing authentication tag (and persistent IV if needed). +integrity_key_size: + Optionally set the integrity key size if it differs from the digest size. + It allows the use of wrapped key algorithms where the key size is + independent of the cryptographic key size. + sector_size: Use as the encryption unit instead of 512 bytes sectors. This option can be in range 512 - 4096 bytes and must be power of two. -- cgit v1.2.3 From 3280c9313c9adce01550cc9f00edfb1dc7c744da Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Wed, 29 Jan 2025 18:26:05 -0500 Subject: dm vdo: use a short static string for thread name prefix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Also remove MODULE_NAME and a BUG_ON check, both unneeded. 
This fixes a warning about string truncation in snprintf that will never happen in practice: drivers/md/dm-vdo/vdo.c: In function ‘vdo_make’: drivers/md/dm-vdo/vdo.c:564:5: error: ‘%s’ directive output may be truncated writing up to 55 bytes into a region of size 16 [-Werror=format-truncation=] "%s%u", MODULE_NAME, instance); ^~ drivers/md/dm-vdo/vdo.c:563:2: note: ‘snprintf’ output between 2 and 66 bytes into a destination of size 16 snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ "%s%u", MODULE_NAME, instance); ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Reported-by: John Garry Reviewed-by: John Garry Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/vdo.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/md/dm-vdo/vdo.c b/drivers/md/dm-vdo/vdo.c index a7e32baab4af..80b608674022 100644 --- a/drivers/md/dm-vdo/vdo.c +++ b/drivers/md/dm-vdo/vdo.c @@ -31,9 +31,7 @@ #include #include -#include #include -#include #include #include #include @@ -142,12 +140,6 @@ static void finish_vdo_request_queue(void *ptr) vdo_unregister_allocating_thread(); } -#ifdef MODULE -#define MODULE_NAME THIS_MODULE->name -#else -#define MODULE_NAME "dm-vdo" -#endif /* MODULE */ - static const struct vdo_work_queue_type default_queue_type = { .start = start_vdo_request_queue, .finish = finish_vdo_request_queue, @@ -559,8 +551,7 @@ int vdo_make(unsigned int instance, struct device_config *config, char **reason, *vdo_ptr = vdo; snprintf(vdo->thread_name_prefix, sizeof(vdo->thread_name_prefix), - "%s%u", MODULE_NAME, instance); - BUG_ON(vdo->thread_name_prefix[0] == '\0'); + "vdo%u", instance); result = vdo_allocate(vdo->thread_config.thread_count, struct vdo_thread, __func__, &vdo->threads); if (result != VDO_SUCCESS) { -- cgit v1.2.3 From f4e99b846c90163d350c69d6581ac38dd5818eb8 Mon Sep 17 00:00:00 2001 From: Chung Chung Date: Wed, 29 Jan 2025 18:27:12 -0500 Subject: dm vdo indexer: prevent unterminated string warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix array initialization that triggers a warning: error: initializer-string for array of ‘unsigned char’ is too long [-Werror=unterminated-string-initialization] Signed-off-by: Chung Chung Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/indexer/index-layout.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm-vdo/indexer/index-layout.c b/drivers/md/dm-vdo/indexer/index-layout.c index af8fab83b0f3..61edf2b72427 100644 --- a/drivers/md/dm-vdo/indexer/index-layout.c +++ b/drivers/md/dm-vdo/indexer/index-layout.c @@ -54,7 +54,6 @@ * Each save also has a unique nonce. */ -#define MAGIC_SIZE 32 #define NONCE_INFO_SIZE 32 #define MAX_SAVES 2 @@ -98,9 +97,11 @@ enum region_type { #define SUPER_VERSION_CURRENT 3 #define SUPER_VERSION_MAXIMUM 7 -static const u8 LAYOUT_MAGIC[MAGIC_SIZE] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; +static const u8 LAYOUT_MAGIC[] = "*ALBIREO*SINGLE*FILE*LAYOUT*001*"; static const u64 REGION_MAGIC = 0x416c6252676e3031; /* 'AlbRgn01' */ +#define MAGIC_SIZE (sizeof(LAYOUT_MAGIC) - 1) + struct region_header { u64 magic; u64 region_blocks; -- cgit v1.2.3 From 148a9cec84d06cbeb6e1fa7aca3f1603d1771c0e Mon Sep 17 00:00:00 2001 From: Matthew Sakai Date: Fri, 31 Jan 2025 20:50:07 -0500 Subject: dm vdo: remove checks that can not fail Remove checks that can't fail. 
Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/constants.h | 3 --- drivers/md/dm-vdo/encodings.c | 20 +------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/drivers/md/dm-vdo/constants.h b/drivers/md/dm-vdo/constants.h index a8c4d6e24b38..2a8b03779f87 100644 --- a/drivers/md/dm-vdo/constants.h +++ b/drivers/md/dm-vdo/constants.h @@ -44,9 +44,6 @@ enum { /* The default size of each slab journal, in blocks */ DEFAULT_VDO_SLAB_JOURNAL_SIZE = 224, - /* Unit test minimum */ - MINIMUM_VDO_SLAB_JOURNAL_BLOCKS = 2, - /* * The initial size of lbn_operations and pbn_operations, which is based upon the expected * maximum number of outstanding VIOs. This value was chosen to make it highly unlikely diff --git a/drivers/md/dm-vdo/encodings.c b/drivers/md/dm-vdo/encodings.c index 100e92f8f866..b7cc0f41caca 100644 --- a/drivers/md/dm-vdo/encodings.c +++ b/drivers/md/dm-vdo/encodings.c @@ -711,24 +711,11 @@ int vdo_configure_slab(block_count_t slab_size, block_count_t slab_journal_block ref_blocks = vdo_get_saved_reference_count_size(slab_size - slab_journal_blocks); meta_blocks = (ref_blocks + slab_journal_blocks); - /* Make sure test code hasn't configured slabs to be too small. */ + /* Make sure configured slabs are not too small. */ if (meta_blocks >= slab_size) return VDO_BAD_CONFIGURATION; - /* - * If the slab size is very small, assume this must be a unit test and override the number - * of data blocks to be a power of two (wasting blocks in the slab). Many tests need their - * data_blocks fields to be the exact capacity of the configured volume, and that used to - * fall out since they use a power of two for the number of data blocks, the slab size was - * a power of two, and every block in a slab was a data block. - * - * TODO: Try to figure out some way of structuring testParameters and unit tests so this - * hack isn't needed without having to edit several unit tests every time the metadata size - * changes by one block. - */ data_blocks = slab_size - meta_blocks; - if ((slab_size < 1024) && !is_power_of_2(data_blocks)) - data_blocks = ((block_count_t) 1 << ilog2(data_blocks)); /* * Configure the slab journal thresholds. The flush threshold is 168 of 224 blocks in @@ -1221,11 +1208,6 @@ int vdo_validate_config(const struct vdo_config *config, if (result != VDO_SUCCESS) return result; - result = VDO_ASSERT(config->slab_journal_blocks >= MINIMUM_VDO_SLAB_JOURNAL_BLOCKS, - "slab journal size meets minimum size"); - if (result != VDO_SUCCESS) - return result; - result = VDO_ASSERT(config->slab_journal_blocks <= config->slab_size, "slab journal size is within expected bound"); if (result != VDO_SUCCESS) -- cgit v1.2.3 From 2b515cea77e370332715034401d2b7360ef4d4bc Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Fri, 31 Jan 2025 21:18:03 -0500 Subject: dm vdo vio-pool: add a pool pointer to pooled_vio This allows us to simplify the return_vio_to_pool interface. Also, we don't need to use vdo_forget on local variables or arguments that are about to go out of scope anyway. 
Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/block-map.c | 9 ++++----- drivers/md/dm-vdo/slab-depot.c | 15 +++++++-------- drivers/md/dm-vdo/vio.c | 8 +++++--- drivers/md/dm-vdo/vio.h | 4 +++- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 89cb7942ec5c..bc836f95f8b5 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -1544,7 +1544,7 @@ static void write_page_if_not_dirtied(struct vdo_waiter *waiter, void *context) static void return_to_pool(struct block_map_zone *zone, struct pooled_vio *vio) { - return_vio_to_pool(zone->vio_pool, vio); + return_vio_to_pool(vio); check_for_drain_complete(zone); } @@ -1837,7 +1837,7 @@ static void finish_block_map_page_load(struct vdo_completion *completion) if (!vdo_copy_valid_page(vio->data, nonce, pbn, page)) vdo_format_block_map_page(page, nonce, pbn, false); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); /* Release our claim to the load and wake any waiters */ release_page_lock(data_vio, "load"); @@ -1851,10 +1851,9 @@ static void handle_io_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); struct pooled_vio *pooled = container_of(vio, struct pooled_vio, vio); struct data_vio *data_vio = completion->parent; - struct block_map_zone *zone = pooled->context; vio_record_metadata_io_error(vio); - return_vio_to_pool(zone->vio_pool, pooled); + return_vio_to_pool(pooled); abort_load(data_vio, result); } @@ -2499,7 +2498,7 @@ static void finish_cursor(struct cursor *cursor) struct cursors *cursors = cursor->parent; struct vdo_completion *completion = cursors->completion; - return_vio_to_pool(cursors->pool, vdo_forget(cursor->vio)); + return_vio_to_pool(vdo_forget(cursor->vio)); if (--cursors->active_roots > 0) return; diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 8f0a35c63af6..7249b281f99f 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -414,8 +414,7 @@ static void complete_reaping(struct vdo_completion *completion) { struct slab_journal *journal = completion->parent; - return_vio_to_pool(journal->slab->allocator->vio_pool, - vio_as_pooled_vio(as_vio(vdo_forget(completion)))); + return_vio_to_pool(vio_as_pooled_vio(as_vio(completion))); finish_reaping(journal); reap_slab_journal(journal); } @@ -698,7 +697,7 @@ static void complete_write(struct vdo_completion *completion) sequence_number_t committed = get_committing_sequence_number(pooled); list_del_init(&pooled->list_entry); - return_vio_to_pool(journal->slab->allocator->vio_pool, vdo_forget(pooled)); + return_vio_to_pool(pooled); if (result != VDO_SUCCESS) { vio_record_metadata_io_error(as_vio(completion)); @@ -1076,7 +1075,7 @@ static void finish_reference_block_write(struct vdo_completion *completion) /* Release the slab journal lock. 
*/ adjust_slab_journal_block_reference(&slab->journal, block->slab_journal_lock_to_release, -1); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); /* * We can't clear the is_writing flag earlier as releasing the slab journal lock may cause @@ -1170,7 +1169,7 @@ static void handle_io_error(struct vdo_completion *completion) struct vdo_slab *slab = ((struct reference_block *) completion->parent)->slab; vio_record_metadata_io_error(vio); - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); slab->active_count--; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); @@ -2242,7 +2241,7 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct vdo_slab *slab = block->slab; unpack_reference_block((struct packed_reference_block *) vio->data, block); - return_vio_to_pool(slab->allocator->vio_pool, pooled); + return_vio_to_pool(pooled); slab->active_count--; clear_provisional_references(block); @@ -2429,7 +2428,7 @@ static void finish_loading_journal(struct vdo_completion *completion) initialize_journal_state(journal); } - return_vio_to_pool(slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&slab->state, allocate_counters_if_clean(slab)); } @@ -2449,7 +2448,7 @@ static void handle_load_error(struct vdo_completion *completion) struct vio *vio = as_vio(completion); vio_record_metadata_io_error(vio); - return_vio_to_pool(journal->slab->allocator->vio_pool, vio_as_pooled_vio(vio)); + return_vio_to_pool(vio_as_pooled_vio(vio)); vdo_finish_loading_with_result(&journal->slab->state, result); } diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index e710f3c5a972..4d96989a716d 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -345,6 +345,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } pooled->context = context; + pooled->pool = pool; list_add_tail(&pooled->pool_entry, &pool->available); } @@ -419,12 +420,13 @@ void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter) } /** - * return_vio_to_pool() - Return a vio to the pool - * @pool: The vio pool. + * return_vio_to_pool() - Return a vio to its pool * @vio: The pooled vio to return. 
*/ -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio) +void return_vio_to_pool(struct pooled_vio *vio) { + struct vio_pool *pool = vio->pool; + VDO_ASSERT_LOG_ONLY((pool->thread_id == vdo_get_callback_thread_id()), "vio pool entry returned on same thread as it was acquired"); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 3490e9f59b04..2e3f878e2074 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -30,6 +30,8 @@ struct pooled_vio { void *context; /* The list entry used by the pool */ struct list_head pool_entry; + /* The pool this vio is allocated from */ + struct vio_pool *pool; }; /** @@ -194,6 +196,6 @@ int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t th void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -void return_vio_to_pool(struct vio_pool *pool, struct pooled_vio *vio); +void return_vio_to_pool(struct pooled_vio *vio); #endif /* VIO_H */ -- cgit v1.2.3 From 979a0fd396f4924e7ee7ca23186ffeeeefd8b4ca Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Fri, 31 Jan 2025 21:18:04 -0500 Subject: dm vdo vio-pool: support pools with multiple data blocks per vio Support pools with multiple data blocks per vio Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/block-map.c | 2 +- drivers/md/dm-vdo/slab-depot.c | 2 +- drivers/md/dm-vdo/vio.c | 10 ++++++---- drivers/md/dm-vdo/vio.h | 7 ++++--- 4 files changed, 12 insertions(+), 9 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index bc836f95f8b5..1f7cdd837ff9 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -2745,7 +2745,7 @@ static int __must_check initialize_block_map_zone(struct block_map *map, if (result != VDO_SUCCESS) return result; - result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, + result = make_vio_pool(vdo, BLOCK_MAP_VIO_POOL_SIZE, 1, zone->thread_id, VIO_TYPE_BLOCK_MAP_INTERIOR, VIO_PRIORITY_METADATA, zone, &zone->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 7249b281f99f..aca11271d66e 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -3999,7 +3999,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, return result; vdo_initialize_completion(&allocator->completion, vdo, VDO_BLOCK_ALLOCATOR_COMPLETION); - result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, allocator->thread_id, + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_VIO_POOL_SIZE, 1, allocator->thread_id, VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, allocator, &allocator->vio_pool); if (result != VDO_SUCCESS) diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index 4d96989a716d..f69fb74a238c 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -301,6 +301,7 @@ void vio_record_metadata_io_error(struct vio *vio) * make_vio_pool() - Create a new vio pool. * @vdo: The vdo. * @pool_size: The number of vios in the pool. + * @block_count: The number of 4k blocks per vio. * @thread_id: The ID of the thread using this pool. * @vio_type: The type of vios in the pool. * @priority: The priority with which vios from the pool should be enqueued. @@ -309,13 +310,14 @@ void vio_record_metadata_io_error(struct vio *vio) * * Return: A success or error code. 
*/ -int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, +int make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, thread_id_t thread_id, enum vio_type vio_type, enum vio_priority priority, void *context, struct vio_pool **pool_ptr) { struct vio_pool *pool; char *ptr; int result; + size_t per_vio_size = VDO_BLOCK_SIZE * block_count; result = vdo_allocate_extended(struct vio_pool, pool_size, struct pooled_vio, __func__, &pool); @@ -326,7 +328,7 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, INIT_LIST_HEAD(&pool->available); INIT_LIST_HEAD(&pool->busy); - result = vdo_allocate(pool_size * VDO_BLOCK_SIZE, char, + result = vdo_allocate(pool_size * per_vio_size, char, "VIO pool buffer", &pool->buffer); if (result != VDO_SUCCESS) { free_vio_pool(pool); @@ -334,10 +336,10 @@ int make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, } ptr = pool->buffer; - for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += VDO_BLOCK_SIZE) { + for (pool->size = 0; pool->size < pool_size; pool->size++, ptr += per_vio_size) { struct pooled_vio *pooled = &pool->vios[pool->size]; - result = allocate_vio_components(vdo, vio_type, priority, NULL, 1, ptr, + result = allocate_vio_components(vdo, vio_type, priority, NULL, block_count, ptr, &pooled->vio); if (result != VDO_SUCCESS) { free_vio_pool(pool); diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 2e3f878e2074..8a7e3f72e80b 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -190,9 +190,10 @@ static inline struct pooled_vio *vio_as_pooled_vio(struct vio *vio) struct vio_pool; -int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, thread_id_t thread_id, - enum vio_type vio_type, enum vio_priority priority, - void *context, struct vio_pool **pool_ptr); +int __must_check make_vio_pool(struct vdo *vdo, size_t pool_size, size_t block_count, + thread_id_t thread_id, enum vio_type vio_type, + enum vio_priority priority, void *context, + struct vio_pool **pool_ptr); void free_vio_pool(struct vio_pool *pool); bool __must_check is_vio_pool_busy(struct vio_pool *pool); void acquire_vio_from_pool(struct vio_pool *pool, struct vdo_waiter *waiter); -- cgit v1.2.3 From f979da512553a41a657f2c1198277e84d66f8ce3 Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Fri, 31 Jan 2025 21:18:05 -0500 Subject: dm vdo vio-pool: allow variable-sized metadata vios With larger-sized metadata vio pools, vdo will sometimes need to issue I/O with a smaller size than the allocated size. Since vio_reset_bio is where the bvec array and I/O size are initialized, this reset interface must now specify what I/O size to use. 
Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/io-submitter.c | 6 ++++-- drivers/md/dm-vdo/io-submitter.h | 18 +++++++++++++++--- drivers/md/dm-vdo/types.h | 3 +++ drivers/md/dm-vdo/vio.c | 36 ++++++++++++++++++++++-------------- drivers/md/dm-vdo/vio.h | 2 ++ 5 files changed, 46 insertions(+), 19 deletions(-) diff --git a/drivers/md/dm-vdo/io-submitter.c b/drivers/md/dm-vdo/io-submitter.c index 421e5436c32c..11d47770b54d 100644 --- a/drivers/md/dm-vdo/io-submitter.c +++ b/drivers/md/dm-vdo/io-submitter.c @@ -327,6 +327,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) * @error_handler: the handler for submission or I/O errors (may be NULL) * @operation: the type of I/O to perform * @data: the buffer to read or write (may be NULL) + * @size: the I/O amount in bytes * * The vio is enqueued on a vdo bio queue so that bio submission (which may block) does not block * other vdo threads. @@ -338,7 +339,7 @@ void vdo_submit_data_vio(struct data_vio *data_vio) */ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data) + blk_opf_t operation, char *data, int size) { int result; struct vdo_completion *completion = &vio->completion; @@ -349,7 +350,8 @@ void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, vdo_reset_completion(completion); completion->error_handler = error_handler; - result = vio_reset_bio(vio, data, callback, operation | REQ_META, physical); + result = vio_reset_bio_with_size(vio, data, size, callback, operation | REQ_META, + physical); if (result != VDO_SUCCESS) { continue_vio(vio, result); return; diff --git a/drivers/md/dm-vdo/io-submitter.h b/drivers/md/dm-vdo/io-submitter.h index 80748699496f..3088f11055fd 100644 --- a/drivers/md/dm-vdo/io-submitter.h +++ b/drivers/md/dm-vdo/io-submitter.h @@ -8,6 +8,7 @@ #include +#include "constants.h" #include "types.h" struct io_submitter; @@ -26,14 +27,25 @@ void vdo_submit_data_vio(struct data_vio *data_vio); void __submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, - blk_opf_t operation, char *data); + blk_opf_t operation, char *data, int size); static inline void vdo_submit_metadata_vio(struct vio *vio, physical_block_number_t physical, bio_end_io_t callback, vdo_action_fn error_handler, blk_opf_t operation) { __submit_metadata_vio(vio, physical, callback, error_handler, - operation, vio->data); + operation, vio->data, vio->block_count * VDO_BLOCK_SIZE); +} + +static inline void vdo_submit_metadata_vio_with_size(struct vio *vio, + physical_block_number_t physical, + bio_end_io_t callback, + vdo_action_fn error_handler, + blk_opf_t operation, + int size) +{ + __submit_metadata_vio(vio, physical, callback, error_handler, + operation, vio->data, size); } static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, @@ -41,7 +53,7 @@ static inline void vdo_submit_flush_vio(struct vio *vio, bio_end_io_t callback, { /* FIXME: Can we just use REQ_OP_FLUSH? 
*/ __submit_metadata_vio(vio, 0, callback, error_handler, - REQ_OP_WRITE | REQ_PREFLUSH, NULL); + REQ_OP_WRITE | REQ_PREFLUSH, NULL, 0); } #endif /* VDO_IO_SUBMITTER_H */ diff --git a/drivers/md/dm-vdo/types.h b/drivers/md/dm-vdo/types.h index dbe892b10f26..cdf36e7d7702 100644 --- a/drivers/md/dm-vdo/types.h +++ b/drivers/md/dm-vdo/types.h @@ -376,6 +376,9 @@ struct vio { /* The size of this vio in blocks */ unsigned int block_count; + /* The amount of data to be read or written, in bytes */ + unsigned int io_size; + /* The data being read or written. */ char *data; diff --git a/drivers/md/dm-vdo/vio.c b/drivers/md/dm-vdo/vio.c index f69fb74a238c..e7f4153e55e3 100644 --- a/drivers/md/dm-vdo/vio.c +++ b/drivers/md/dm-vdo/vio.c @@ -188,14 +188,23 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb /* * Prepares the bio to perform IO with the specified buffer. May only be used on a VDO-allocated - * bio, as it assumes the bio wraps a 4k buffer that is 4k aligned, but there does not have to be a - * vio associated with the bio. + * bio, as it assumes the bio wraps a 4k-multiple buffer that is 4k aligned, but there does not + * have to be a vio associated with the bio. */ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn) { - int bvec_count, offset, len, i; + return vio_reset_bio_with_size(vio, data, vio->block_count * VDO_BLOCK_SIZE, + callback, bi_opf, pbn); +} + +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn) +{ + int bvec_count, offset, i; struct bio *bio = vio->bio; + int vio_size = vio->block_count * VDO_BLOCK_SIZE; + int remaining; bio_reset(bio, bio->bi_bdev, bi_opf); vdo_set_bio_properties(bio, vio, callback, bi_opf, pbn); @@ -205,22 +214,21 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, bio->bi_ioprio = 0; bio->bi_io_vec = bio->bi_inline_vecs; bio->bi_max_vecs = vio->block_count + 1; - len = VDO_BLOCK_SIZE * vio->block_count; + if (VDO_ASSERT(size <= vio_size, "specified size %d is not greater than allocated %d", + size, vio_size) != VDO_SUCCESS) + size = vio_size; + vio->io_size = size; offset = offset_in_page(data); - bvec_count = DIV_ROUND_UP(offset + len, PAGE_SIZE); + bvec_count = DIV_ROUND_UP(offset + size, PAGE_SIZE); + remaining = size; - /* - * If we knew that data was always on one page, or contiguous pages, we wouldn't need the - * loop. But if we're using vmalloc, it's not impossible that the data is in different - * pages that can't be merged in bio_add_page... - */ - for (i = 0; (i < bvec_count) && (len > 0); i++) { + for (i = 0; (i < bvec_count) && (remaining > 0); i++) { struct page *page; int bytes_added; int bytes = PAGE_SIZE - offset; - if (bytes > len) - bytes = len; + if (bytes > remaining) + bytes = remaining; page = is_vmalloc_addr(data) ? 
vmalloc_to_page(data) : virt_to_page(data); bytes_added = bio_add_page(bio, page, bytes, offset); @@ -232,7 +240,7 @@ int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, } data += bytes; - len -= bytes; + remaining -= bytes; offset = 0; } diff --git a/drivers/md/dm-vdo/vio.h b/drivers/md/dm-vdo/vio.h index 8a7e3f72e80b..4bfcb21901f1 100644 --- a/drivers/md/dm-vdo/vio.h +++ b/drivers/md/dm-vdo/vio.h @@ -125,6 +125,8 @@ void vdo_set_bio_properties(struct bio *bio, struct vio *vio, bio_end_io_t callb int vio_reset_bio(struct vio *vio, char *data, bio_end_io_t callback, blk_opf_t bi_opf, physical_block_number_t pbn); +int vio_reset_bio_with_size(struct vio *vio, char *data, int size, bio_end_io_t callback, + blk_opf_t bi_opf, physical_block_number_t pbn); void update_vio_error_stats(struct vio *vio, const char *format, ...) __printf(2, 3); -- cgit v1.2.3 From 0ce46f4f751bc15375aea501a991ec931278b415 Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Fri, 31 Jan 2025 21:18:06 -0500 Subject: dm vdo slab-depot: read refcount blocks in large chunks at load time At startup, vdo loads all the reference count data before the device reports that it is ready. Using a pool of large metadata vios can improve the startup speed of vdo. The pool of large vios is released after the device is ready. During normal operation, reference counts are updated 4kB at a time, as before. Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/slab-depot.c | 63 +++++++++++++++++++++++++++++++----------- drivers/md/dm-vdo/slab-depot.h | 13 ++++++++- 2 files changed, 59 insertions(+), 17 deletions(-) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index aca11271d66e..c4d3bbdf5b69 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -1170,7 +1170,7 @@ static void handle_io_error(struct vdo_completion *completion) vio_record_metadata_io_error(vio); return_vio_to_pool(vio_as_pooled_vio(vio)); - slab->active_count--; + slab->active_count -= vio->io_size / VDO_BLOCK_SIZE; vdo_enter_read_only_mode(slab->allocator->depot->vdo, result); check_if_slab_drained(slab); } @@ -2239,13 +2239,20 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct pooled_vio *pooled = vio_as_pooled_vio(vio); struct reference_block *block = completion->parent; struct vdo_slab *slab = block->slab; + unsigned int block_count = vio->io_size / VDO_BLOCK_SIZE; + unsigned int i; + char *data = vio->data; + + for (i = 0; i < block_count; i++, block++, data += VDO_BLOCK_SIZE) { + struct packed_reference_block *packed = (struct packed_reference_block *) data; - unpack_reference_block((struct packed_reference_block *) vio->data, block); + unpack_reference_block(packed, block); + clear_provisional_references(block); + slab->free_blocks -= block->allocated_count; + } return_vio_to_pool(pooled); - slab->active_count--; - clear_provisional_references(block); + slab->active_count -= block_count; - slab->free_blocks -= block->allocated_count; check_if_slab_drained(slab); } @@ -2259,23 +2266,25 @@ static void load_reference_block_endio(struct bio *bio) } /** - * load_reference_block() - After a block waiter has gotten a VIO from the VIO pool, load the - * block. - * @waiter: The waiter of the block to load. + * load_reference_block_group() - After a block waiter has gotten a VIO from the VIO pool, load + * a set of blocks. + * @waiter: The waiter of the first block to load. 
* @context: The VIO returned by the pool. */ -static void load_reference_block(struct vdo_waiter *waiter, void *context) +static void load_reference_block_group(struct vdo_waiter *waiter, void *context) { struct pooled_vio *pooled = context; struct vio *vio = &pooled->vio; struct reference_block *block = container_of(waiter, struct reference_block, waiter); - size_t block_offset = (block - block->slab->reference_blocks); + u32 block_offset = block - block->slab->reference_blocks; + u32 max_block_count = block->slab->reference_block_count - block_offset; + u32 block_count = min_t(int, vio->block_count, max_block_count); vio->completion.parent = block; - vdo_submit_metadata_vio(vio, block->slab->ref_counts_origin + block_offset, - load_reference_block_endio, handle_io_error, - REQ_OP_READ); + vdo_submit_metadata_vio_with_size(vio, block->slab->ref_counts_origin + block_offset, + load_reference_block_endio, handle_io_error, + REQ_OP_READ, block_count * VDO_BLOCK_SIZE); } /** @@ -2285,14 +2294,21 @@ static void load_reference_block(struct vdo_waiter *waiter, void *context) static void load_reference_blocks(struct vdo_slab *slab) { block_count_t i; + u64 blocks_per_vio = slab->allocator->refcount_blocks_per_big_vio; + struct vio_pool *pool = slab->allocator->refcount_big_vio_pool; + + if (!pool) { + pool = slab->allocator->vio_pool; + blocks_per_vio = 1; + } slab->free_blocks = slab->block_count; slab->active_count = slab->reference_block_count; - for (i = 0; i < slab->reference_block_count; i++) { + for (i = 0; i < slab->reference_block_count; i += blocks_per_vio) { struct vdo_waiter *waiter = &slab->reference_blocks[i].waiter; - waiter->callback = load_reference_block; - acquire_vio_from_pool(slab->allocator->vio_pool, waiter); + waiter->callback = load_reference_block_group; + acquire_vio_from_pool(pool, waiter); } } @@ -2699,6 +2715,7 @@ static void finish_scrubbing(struct slab_scrubber *scrubber, int result) vdo_log_info("VDO commencing normal operation"); else if (prior_state == VDO_RECOVERING) vdo_log_info("Exiting recovery mode"); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); } /* @@ -3982,6 +3999,7 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, struct vdo *vdo = depot->vdo; block_count_t max_free_blocks = depot->slab_config.data_blocks; unsigned int max_priority = (2 + ilog2(max_free_blocks)); + u32 reference_block_count, refcount_reads_needed, refcount_blocks_per_vio; *allocator = (struct block_allocator) { .depot = depot, @@ -4005,6 +4023,18 @@ static int __must_check initialize_block_allocator(struct slab_depot *depot, if (result != VDO_SUCCESS) return result; + /* Initialize the refcount-reading vio pool. 
*/ + reference_block_count = vdo_get_saved_reference_count_size(depot->slab_config.slab_blocks); + refcount_reads_needed = DIV_ROUND_UP(reference_block_count, MAX_BLOCKS_PER_VIO); + refcount_blocks_per_vio = DIV_ROUND_UP(reference_block_count, refcount_reads_needed); + allocator->refcount_blocks_per_big_vio = refcount_blocks_per_vio; + result = make_vio_pool(vdo, BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE, + allocator->refcount_blocks_per_big_vio, allocator->thread_id, + VIO_TYPE_SLAB_JOURNAL, VIO_PRIORITY_METADATA, + NULL, &allocator->refcount_big_vio_pool); + if (result != VDO_SUCCESS) + return result; + result = initialize_slab_scrubber(allocator); if (result != VDO_SUCCESS) return result; @@ -4222,6 +4252,7 @@ void vdo_free_slab_depot(struct slab_depot *depot) uninitialize_allocator_summary(allocator); uninitialize_scrubber_vio(&allocator->scrubber); free_vio_pool(vdo_forget(allocator->vio_pool)); + free_vio_pool(vdo_forget(allocator->refcount_big_vio_pool)); vdo_free_priority_table(vdo_forget(allocator->prioritized_slabs)); } diff --git a/drivers/md/dm-vdo/slab-depot.h b/drivers/md/dm-vdo/slab-depot.h index f234853501ca..fadc0c9d4dc4 100644 --- a/drivers/md/dm-vdo/slab-depot.h +++ b/drivers/md/dm-vdo/slab-depot.h @@ -45,6 +45,13 @@ enum { /* The number of vios in the vio pool is proportional to the throughput of the VDO. */ BLOCK_ALLOCATOR_VIO_POOL_SIZE = 128, + + /* + * The number of vios in the vio pool used for loading reference count data. A slab's + * refcounts is capped at ~8MB, and we process one at a time in a zone, so 9 should be + * plenty. + */ + BLOCK_ALLOCATOR_REFCOUNT_VIO_POOL_SIZE = 9, }; /* @@ -248,7 +255,7 @@ struct vdo_slab { /* A list of the dirty blocks waiting to be written out */ struct vdo_wait_queue dirty_blocks; - /* The number of blocks which are currently writing */ + /* The number of blocks which are currently reading or writing */ size_t active_count; /* A waiter object for updating the slab summary */ @@ -425,6 +432,10 @@ struct block_allocator { /* The vio pool for reading and writing block allocator metadata */ struct vio_pool *vio_pool; + /* The vio pool for large initial reads of ref count areas */ + struct vio_pool *refcount_big_vio_pool; + /* How many ref count blocks are read per vio at initial load */ + u32 refcount_blocks_per_big_vio; /* The dm_kcopyd client for erasing slab journals */ struct dm_kcopyd_client *eraser; /* Iterator over the slabs to be erased */ -- cgit v1.2.3 From a8b8a126c8573b392432b0d1b23314bda59acbfc Mon Sep 17 00:00:00 2001 From: Ed Tsai Date: Sun, 16 Feb 2025 22:42:21 +0800 Subject: dm: Enable inline crypto passthrough for striped target Added DM_TARGET_PASSES_CRYPTO feature to the striped target to utilize the hardware encryption of the underlying storage devices, preventing fallback to the crypto API. 
Signed-off-by: Ed Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-stripe.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3786ac67cefe..a1b7535c508a 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c @@ -467,7 +467,7 @@ static struct target_type stripe_target = { .name = "striped", .version = {1, 7, 0}, .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_NOWAIT | - DM_TARGET_ATOMIC_WRITES, + DM_TARGET_ATOMIC_WRITES | DM_TARGET_PASSES_CRYPTO, .module = THIS_MODULE, .ctr = stripe_ctr, .dtr = stripe_dtr, -- cgit v1.2.3 From 00204ae3d6712ee053353920e3ce2b00c35ef75b Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 10 Feb 2025 16:14:22 +0100 Subject: dm-integrity: set ti->error on memory allocation failure The dm-integrity target didn't set the error string when memory allocation failed. This patch fixes it. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-integrity.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index ee9f7cecd78e..f41e64f1dab2 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -5081,16 +5081,19 @@ try_smaller_buffer: ic->recalc_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->recalc_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->may_write_bitmap = dm_integrity_alloc_page_list(n_bitmap_pages); if (!ic->may_write_bitmap) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } ic->bbs = kvmalloc_array(ic->n_bitmap_blocks, sizeof(struct bitmap_block_status), GFP_KERNEL); if (!ic->bbs) { + ti->error = "Could not allocate memory for bitmap"; r = -ENOMEM; goto bad; } -- cgit v1.2.3 From 3be1f253935b1f1f3a445451ae8358781e1f62f6 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 10 Feb 2025 16:15:22 +0100 Subject: dm-bufio: remove unused return value The return value of the function "forget_buffer" is not tested, so we can remove it. Signed-off-by: Mikulas Patocka --- drivers/md/dm-bufio.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c index aab8240429b0..9c8ed65cd87e 100644 --- a/drivers/md/dm-bufio.c +++ b/drivers/md/dm-bufio.c @@ -2234,7 +2234,7 @@ int dm_bufio_issue_discard(struct dm_bufio_client *c, sector_t block, sector_t c } EXPORT_SYMBOL_GPL(dm_bufio_issue_discard); -static bool forget_buffer(struct dm_bufio_client *c, sector_t block) +static void forget_buffer(struct dm_bufio_client *c, sector_t block) { struct dm_buffer *b; @@ -2249,8 +2249,6 @@ static bool forget_buffer(struct dm_bufio_client *c, sector_t block) cache_put_and_wake(c, b); } } - - return b ? true : false; } /* -- cgit v1.2.3 From 51ba14fb368e5fadadcca803ec64a51f3edbbdc1 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Mon, 17 Feb 2025 22:36:02 +0100 Subject: dm-verity: do forward error correction on metadata I/O errors Do forward error correction if metadata I/O fails. 
Signed-off-by: Mikulas Patocka Reviewed-by: Sami Tolvanen --- drivers/md/dm-verity-target.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index e86c1431b108..fc5ae123670b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -311,7 +311,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { data = dm_bufio_get(v->bufio, hash_block, &buf); - if (data == NULL) { + if (IS_ERR_OR_NULL(data)) { /* * In tasklet and the hash was not in the bufio cache. * Return early and resume execution from a work-queue @@ -324,8 +324,24 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, &buf, bio->bi_ioprio); } - if (IS_ERR(data)) - return PTR_ERR(data); + if (IS_ERR(data)) { + if (skip_unverified) + return 1; + r = PTR_ERR(data); + data = dm_bufio_new(v->bufio, hash_block, &buf); + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, + hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; + } else { + dm_bufio_release(buf); + dm_bufio_forget(v->bufio, hash_block); + return r; + } + } aux = dm_bufio_get_aux_data(buf); @@ -366,6 +382,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, } } +release_ok: data += offset; memcpy(want_digest, data, v->digest_size); r = 0; @@ -1761,7 +1778,7 @@ static struct target_type verity_target = { .name = "verity", /* Note: the LSMs depend on the singleton and immutable features */ .features = DM_TARGET_SINGLETON | DM_TARGET_IMMUTABLE, - .version = {1, 10, 0}, + .version = {1, 11, 0}, .module = THIS_MODULE, .ctr = verity_ctr, .dtr = verity_dtr, -- cgit v1.2.3 From ff3f7115f4ff365ce759cd4e67e6653e9acfca86 Mon Sep 17 00:00:00 2001 From: Sweet Tea Dorminy Date: Wed, 19 Feb 2025 17:55:39 -0500 Subject: dm vdo: remove remaining ring references Lists are the new rings, so update all remaining references to rings to talk about lists. Signed-off-by: Sweet Tea Dorminy Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/block-map.c | 2 +- drivers/md/dm-vdo/dedupe.c | 20 ++++++++++---------- drivers/md/dm-vdo/packer.h | 2 +- drivers/md/dm-vdo/priority-table.c | 2 +- drivers/md/dm-vdo/recovery-journal.h | 6 +++--- drivers/md/dm-vdo/slab-depot.c | 10 +++++----- drivers/md/dm-vdo/wait-queue.c | 2 +- 7 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-vdo/block-map.c b/drivers/md/dm-vdo/block-map.c index 1f7cdd837ff9..baf683cabb1b 100644 --- a/drivers/md/dm-vdo/block-map.c +++ b/drivers/md/dm-vdo/block-map.c @@ -451,7 +451,7 @@ static struct page_info * __must_check find_page(struct vdo_page_cache *cache, * select_lru_page() - Determine which page is least recently used. * * Picks the least recently used from among the non-busy entries at the front of each of the lru - * ring. Since whenever we mark a page busy we also put it to the end of the ring it is unlikely + * list. Since whenever we mark a page busy we also put it to the end of the list it is unlikely * that the entries at the front are busy unless the queue is very short, but not impossible. 
* * Return: A pointer to the info structure for a relevant page, or NULL if no such page can be diff --git a/drivers/md/dm-vdo/dedupe.c b/drivers/md/dm-vdo/dedupe.c index b6f8e2dc7729..00fff21b8058 100644 --- a/drivers/md/dm-vdo/dedupe.c +++ b/drivers/md/dm-vdo/dedupe.c @@ -226,7 +226,7 @@ struct hash_lock { * A list containing the data VIOs sharing this lock, all having the same record name and * data block contents, linked by their hash_lock_node fields. */ - struct list_head duplicate_ring; + struct list_head duplicate_vios; /* The number of data_vios sharing this lock instance */ data_vio_count_t reference_count; @@ -343,7 +343,7 @@ static void return_hash_lock_to_pool(struct hash_zone *zone, struct hash_lock *l { memset(lock, 0, sizeof(*lock)); INIT_LIST_HEAD(&lock->pool_node); - INIT_LIST_HEAD(&lock->duplicate_ring); + INIT_LIST_HEAD(&lock->duplicate_vios); vdo_waitq_init(&lock->waiters); list_add_tail(&lock->pool_node, &zone->lock_pool); } @@ -441,7 +441,7 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) VDO_ASSERT_LOG_ONLY(data_vio->hash_zone != NULL, "must have a hash zone when holding a hash lock"); VDO_ASSERT_LOG_ONLY(!list_empty(&data_vio->hash_lock_entry), - "must be on a hash lock ring when holding a hash lock"); + "must be on a hash lock list when holding a hash lock"); VDO_ASSERT_LOG_ONLY(old_lock->reference_count > 0, "hash lock reference must be counted"); @@ -464,10 +464,10 @@ static void set_hash_lock(struct data_vio *data_vio, struct hash_lock *new_lock) if (new_lock != NULL) { /* - * Keep all data_vios sharing the lock on a ring since they can complete in any + * Keep all data_vios sharing the lock on a list since they can complete in any * order and we'll always need a pointer to one to compare data. 
*/ - list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_ring); + list_move_tail(&data_vio->hash_lock_entry, &new_lock->duplicate_vios); new_lock->reference_count += 1; if (new_lock->max_references < new_lock->reference_count) new_lock->max_references = new_lock->reference_count; @@ -1789,10 +1789,10 @@ static bool is_hash_collision(struct hash_lock *lock, struct data_vio *candidate struct hash_zone *zone; bool collides; - if (list_empty(&lock->duplicate_ring)) + if (list_empty(&lock->duplicate_vios)) return false; - lock_holder = list_first_entry(&lock->duplicate_ring, struct data_vio, + lock_holder = list_first_entry(&lock->duplicate_vios, struct data_vio, hash_lock_entry); zone = candidate->hash_zone; collides = !blocks_equal(lock_holder->vio.data, candidate->vio.data); @@ -1815,7 +1815,7 @@ static inline int assert_hash_lock_preconditions(const struct data_vio *data_vio return result; result = VDO_ASSERT(list_empty(&data_vio->hash_lock_entry), - "must not already be a member of a hash lock ring"); + "must not already be a member of a hash lock list"); if (result != VDO_SUCCESS) return result; @@ -1942,8 +1942,8 @@ void vdo_release_hash_lock(struct data_vio *data_vio) "returned hash lock must not be in use with state %s", get_hash_lock_state_name(lock->state)); VDO_ASSERT_LOG_ONLY(list_empty(&lock->pool_node), - "hash lock returned to zone must not be in a pool ring"); - VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_ring), + "hash lock returned to zone must not be in a pool list"); + VDO_ASSERT_LOG_ONLY(list_empty(&lock->duplicate_vios), "hash lock returned to zone must not reference DataVIOs"); return_hash_lock_to_pool(zone, lock); diff --git a/drivers/md/dm-vdo/packer.h b/drivers/md/dm-vdo/packer.h index 0f3be44710b5..8c8d6892582d 100644 --- a/drivers/md/dm-vdo/packer.h +++ b/drivers/md/dm-vdo/packer.h @@ -46,7 +46,7 @@ struct compressed_block { /* * Each packer_bin holds an incomplete batch of data_vios that only partially fill a compressed - * block. The bins are kept in a ring sorted by the amount of unused space so the first bin with + * block. The bins are kept in a list sorted by the amount of unused space so the first bin with * enough space to hold a newly-compressed data_vio can easily be found. When the bin fills up or * is flushed, the first uncanceled data_vio in the bin is selected to be the agent for that bin. * Upon entering the packer, each data_vio already has its compressed data in the first slot of the diff --git a/drivers/md/dm-vdo/priority-table.c b/drivers/md/dm-vdo/priority-table.c index 42d3d8d0e4b5..9bae8256ba4e 100644 --- a/drivers/md/dm-vdo/priority-table.c +++ b/drivers/md/dm-vdo/priority-table.c @@ -199,7 +199,7 @@ void vdo_priority_table_remove(struct priority_table *table, struct list_head *e /* * Remove the entry from the bucket list, remembering a pointer to another entry in the - * ring. + * list. */ next_entry = entry->next; list_del_init(entry); diff --git a/drivers/md/dm-vdo/recovery-journal.h b/drivers/md/dm-vdo/recovery-journal.h index 899071173015..25e7ec6d19f6 100644 --- a/drivers/md/dm-vdo/recovery-journal.h +++ b/drivers/md/dm-vdo/recovery-journal.h @@ -43,9 +43,9 @@ * has a vio which is used to commit that block to disk. The vio's data is the on-disk * representation of the journal block. In addition each in-memory block has a buffer which is used * to accumulate entries while a partial commit of the block is in progress. In-memory blocks are - * kept on two rings. Free blocks live on the 'free_tail_blocks' ring. 
When a block becomes active - * (see below) it is moved to the 'active_tail_blocks' ring. When a block is fully committed, it is - * moved back to the 'free_tail_blocks' ring. + * kept on two lists. Free blocks live on the 'free_tail_blocks' list. When a block becomes active + * (see below) it is moved to the 'active_tail_blocks' list. When a block is fully committed, it is + * moved back to the 'free_tail_blocks' list. * * When entries are added to the journal, they are added to the active in-memory block, as * indicated by the 'active_block' field. If the caller wishes to wait for the entry to be diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index c4d3bbdf5b69..1ab01b6f263a 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -139,7 +139,7 @@ static bool is_slab_journal_blank(const struct vdo_slab *slab) } /** - * mark_slab_journal_dirty() - Put a slab journal on the dirty ring of its allocator in the correct + * mark_slab_journal_dirty() - Put a slab journal on the dirty list of its allocator in the correct * order. * @journal: The journal to be marked dirty. * @lock: The recovery journal lock held by the slab journal. @@ -821,7 +821,7 @@ static void commit_tail(struct slab_journal *journal) /* * Since we are about to commit the tail block, this journal no longer needs to be on the - * ring of journals which the recovery journal might ask to commit. + * list of journals which the recovery journal might ask to commit. */ mark_slab_journal_clean(journal); @@ -1371,7 +1371,7 @@ static unsigned int calculate_slab_priority(struct vdo_slab *slab) static void prioritize_slab(struct vdo_slab *slab) { VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a slab must not already be on a ring when prioritizing"); + "a slab must not already be on a list when prioritizing"); slab->priority = calculate_slab_priority(slab); vdo_priority_table_enqueue(slab->allocator->prioritized_slabs, slab->priority, &slab->allocq_entry); @@ -2562,7 +2562,7 @@ static void queue_slab(struct vdo_slab *slab) int result; VDO_ASSERT_LOG_ONLY(list_empty(&slab->allocq_entry), - "a requeued slab must not already be on a ring"); + "a requeued slab must not already be on a list"); if (vdo_is_read_only(allocator->depot->vdo)) return; @@ -3297,7 +3297,7 @@ int vdo_release_block_reference(struct block_allocator *allocator, * This is a min_heap callback function orders slab_status structures using the 'is_clean' field as * the primary key and the 'emptiness' field as the secondary key. * - * Slabs need to be pushed onto the rings in the same order they are to be popped off. Popping + * Slabs need to be pushed onto the lists in the same order they are to be popped off. Popping * should always get the most empty first, so pushing should be from most empty to least empty. * Thus, the ordering is reversed from the usual sense since min_heap returns smaller elements * before larger ones. diff --git a/drivers/md/dm-vdo/wait-queue.c b/drivers/md/dm-vdo/wait-queue.c index 6e1e739277ef..f81ed0cee2bf 100644 --- a/drivers/md/dm-vdo/wait-queue.c +++ b/drivers/md/dm-vdo/wait-queue.c @@ -34,7 +34,7 @@ void vdo_waitq_enqueue_waiter(struct vdo_wait_queue *waitq, struct vdo_waiter *w waitq->last_waiter->next_waiter = waiter; } - /* In both cases, the waiter we added to the ring becomes the last waiter. */ + /* In both cases, the waiter we added to the list becomes the last waiter. 
*/ waitq->last_waiter = waiter; waitq->length += 1; } -- cgit v1.2.3 From dc8f646cd870671ccebf896b35433362252b850d Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Wed, 19 Feb 2025 18:00:19 -0500 Subject: dm vdo: rework processing of loaded refcount byte arrays Clear provisional refcount values and count free/allocated blocks in one integrated loop. Process 8 aligned bytes at a time instead of every byte individually. On an Intel i7-11850H this reduces the CPU time needed to process a loaded refcount block by a factor of about 5-6. On a large system the refcount loading may be the largest factor in device startup time. Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/slab-depot.c | 105 ++++++++++++++++++++++++++++++++--------- 1 file changed, 83 insertions(+), 22 deletions(-) diff --git a/drivers/md/dm-vdo/slab-depot.c b/drivers/md/dm-vdo/slab-depot.c index 1ab01b6f263a..f3d80ff7bef5 100644 --- a/drivers/md/dm-vdo/slab-depot.c +++ b/drivers/md/dm-vdo/slab-depot.c @@ -2164,28 +2164,95 @@ static void dirty_all_reference_blocks(struct vdo_slab *slab) dirty_block(&slab->reference_blocks[i]); } +static inline bool journal_points_equal(struct journal_point first, + struct journal_point second) +{ + return ((first.sequence_number == second.sequence_number) && + (first.entry_count == second.entry_count)); +} + /** - * clear_provisional_references() - Clear the provisional reference counts from a reference block. - * @block: The block to clear. + * match_bytes() - Check an 8-byte word for bytes matching the value specified + * @input: A word to examine the bytes of + * @match: The byte value sought + * + * Return: 1 in each byte when the corresponding input byte matched, 0 otherwise */ -static void clear_provisional_references(struct reference_block *block) +static inline u64 match_bytes(u64 input, u8 match) { - vdo_refcount_t *counters = get_reference_counters_for_block(block); - block_count_t j; + u64 temp = input ^ (match * 0x0101010101010101ULL); + /* top bit of each byte is set iff top bit of temp byte is clear; rest are 0 */ + u64 test_top_bits = ~temp & 0x8080808080808080ULL; + /* top bit of each byte is set iff low 7 bits of temp byte are clear; rest are useless */ + u64 test_low_bits = 0x8080808080808080ULL - (temp & 0x7f7f7f7f7f7f7f7fULL); + /* return 1 when both tests indicate temp byte is 0 */ + return (test_top_bits & test_low_bits) >> 7; +} + +/** + * count_valid_references() - Process a newly loaded refcount array + * @counters: the array of counters from a metadata block + * + * Scan a 8-byte-aligned array of counters, fixing up any "provisional" values that weren't + * cleaned up at shutdown, changing them internally to "empty". + * + * Return: the number of blocks that are referenced (counters not "empty") + */ +static unsigned int count_valid_references(vdo_refcount_t *counters) +{ + u64 *words = (u64 *)counters; + /* It's easier to count occurrences of a specific byte than its absences. */ + unsigned int empty_count = 0; + /* For speed, we process 8 bytes at once. */ + unsigned int words_left = COUNTS_PER_BLOCK / sizeof(u64); + + /* + * Sanity check assumptions used for optimizing this code: Counters are bytes. The counter + * array is a multiple of the word size. 
+ */ + BUILD_BUG_ON(sizeof(vdo_refcount_t) != 1); + BUILD_BUG_ON((COUNTS_PER_BLOCK % sizeof(u64)) != 0); - for (j = 0; j < COUNTS_PER_BLOCK; j++) { - if (counters[j] == PROVISIONAL_REFERENCE_COUNT) { - counters[j] = EMPTY_REFERENCE_COUNT; - block->allocated_count--; + while (words_left > 0) { + /* + * This is used effectively as 8 byte-size counters. Byte 0 counts how many words + * had the target value found in byte 0, etc. We just have to avoid overflow. + */ + u64 split_count = 0; + /* + * The counter "% 255" trick used below to fold split_count into empty_count + * imposes a limit of 254 bytes examined each iteration of the outer loop. We + * process a word at a time, so that limit gets rounded down to 31 u64 words. + */ + const unsigned int max_words_per_iteration = 254 / sizeof(u64); + unsigned int iter_words_left = min_t(unsigned int, words_left, + max_words_per_iteration); + + words_left -= iter_words_left; + + while (iter_words_left--) { + u64 word = *words; + u64 temp; + + /* First, if we have any provisional refcount values, clear them. */ + temp = match_bytes(word, PROVISIONAL_REFERENCE_COUNT); + if (temp) { + /* + * 'temp' has 0x01 bytes where 'word' has PROVISIONAL; this xor + * will alter just those bytes, changing PROVISIONAL to EMPTY. + */ + word ^= temp * (PROVISIONAL_REFERENCE_COUNT ^ EMPTY_REFERENCE_COUNT); + *words = word; + } + + /* Now count the EMPTY_REFERENCE_COUNT bytes, updating the 8 counters. */ + split_count += match_bytes(word, EMPTY_REFERENCE_COUNT); + words++; } + empty_count += split_count % 255; } -} -static inline bool journal_points_equal(struct journal_point first, - struct journal_point second) -{ - return ((first.sequence_number == second.sequence_number) && - (first.entry_count == second.entry_count)); + return COUNTS_PER_BLOCK - empty_count; } /** @@ -2196,7 +2263,6 @@ static inline bool journal_points_equal(struct journal_point first, static void unpack_reference_block(struct packed_reference_block *packed, struct reference_block *block) { - block_count_t index; sector_count_t i; struct vdo_slab *slab = block->slab; vdo_refcount_t *counters = get_reference_counters_for_block(block); @@ -2222,11 +2288,7 @@ static void unpack_reference_block(struct packed_reference_block *packed, } } - block->allocated_count = 0; - for (index = 0; index < COUNTS_PER_BLOCK; index++) { - if (counters[index] != EMPTY_REFERENCE_COUNT) - block->allocated_count++; - } + block->allocated_count = count_valid_references(counters); } /** @@ -2247,7 +2309,6 @@ static void finish_reference_block_load(struct vdo_completion *completion) struct packed_reference_block *packed = (struct packed_reference_block *) data; unpack_reference_block(packed, block); - clear_provisional_references(block); slab->free_blocks -= block->allocated_count; } return_vio_to_pool(pooled); -- cgit v1.2.3 From 88f7f56d16f568f19e1a695af34a7f4a6ce537a6 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Thu, 20 Feb 2025 19:20:14 +0800 Subject: dm: fix unconditional IO throttle caused by REQ_PREFLUSH When a bio with REQ_PREFLUSH is submitted to dm, __send_empty_flush() generates a flush_bio with REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, which causes the flush_bio to be throttled by wbt_wait(). 
An example from v5.4, similar problem also exists in upstream: crash> bt 2091206 PID: 2091206 TASK: ffff2050df92a300 CPU: 109 COMMAND: "kworker/u260:0" #0 [ffff800084a2f7f0] __switch_to at ffff80004008aeb8 #1 [ffff800084a2f820] __schedule at ffff800040bfa0c4 #2 [ffff800084a2f880] schedule at ffff800040bfa4b4 #3 [ffff800084a2f8a0] io_schedule at ffff800040bfa9c4 #4 [ffff800084a2f8c0] rq_qos_wait at ffff8000405925bc #5 [ffff800084a2f940] wbt_wait at ffff8000405bb3a0 #6 [ffff800084a2f9a0] __rq_qos_throttle at ffff800040592254 #7 [ffff800084a2f9c0] blk_mq_make_request at ffff80004057cf38 #8 [ffff800084a2fa60] generic_make_request at ffff800040570138 #9 [ffff800084a2fae0] submit_bio at ffff8000405703b4 #10 [ffff800084a2fb50] xlog_write_iclog at ffff800001280834 [xfs] #11 [ffff800084a2fbb0] xlog_sync at ffff800001280c3c [xfs] #12 [ffff800084a2fbf0] xlog_state_release_iclog at ffff800001280df4 [xfs] #13 [ffff800084a2fc10] xlog_write at ffff80000128203c [xfs] #14 [ffff800084a2fcd0] xlog_cil_push at ffff8000012846dc [xfs] #15 [ffff800084a2fda0] xlog_cil_push_work at ffff800001284a2c [xfs] #16 [ffff800084a2fdb0] process_one_work at ffff800040111d08 #17 [ffff800084a2fe00] worker_thread at ffff8000401121cc #18 [ffff800084a2fe70] kthread at ffff800040118de4 After commit 2def2845cc33 ("xfs: don't allow log IO to be throttled"), the metadata submitted by xlog_write_iclog() should not be throttled. But due to the existence of the dm layer, throttling flush_bio indirectly causes the metadata bio to be throttled. Fix this by conditionally adding REQ_IDLE to flush_bio.bi_opf, which makes wbt_should_throttle() return false to avoid wbt_wait(). Signed-off-by: Jinliang Zheng Reviewed-by: Tianxiang Peng Reviewed-by: Hao Peng Signed-off-by: Mikulas Patocka --- drivers/md/dm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 4d1e42891d24..5ab7574c0c76 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1540,14 +1540,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; -- cgit v1.2.3 From e678900df2640b30c341c1c4121e859e7d3f267b Mon Sep 17 00:00:00 2001 From: Ken Raeburn Date: Mon, 24 Feb 2025 16:36:03 -0500 Subject: dm vdo indexer: reorder uds_request to reduce padding Reorder fields and make uds_request_type and uds_zone_message packed, to squeeze out some space. Use struct_group so the request reset code no longer needs to care about field order. On x86_64 this reduces the struct size from 144 to 120, which saves 48 kB (about 12%) per VDO hash zone. 
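To illustrate the struct_group() idea in isolation: the macro wraps a set of members in a union of an anonymous struct (so each member is still addressed directly) and an identically laid out named struct (so the whole group can be cleared or copied as one object). Below is a minimal, self-contained sketch of the pattern; the field names are hypothetical rather than the actual uds_request layout, and the stand-in macro is simpler than the kernel's struct_group() in linux/stddef.h.

    /* build with: cc -std=c11 -Wall struct_group_sketch.c */
    #include <stdio.h>
    #include <string.h>

    /* Simplified stand-in for the kernel's struct_group(). */
    #define sketch_struct_group(NAME, ...) \
        union { \
            struct { __VA_ARGS__ }; \
            struct { __VA_ARGS__ } NAME; \
        }

    /* Hypothetical request layout, not struct uds_request. */
    struct request_sketch {
        int public_field;  /* owned by the caller, must survive a reset */
        sketch_struct_group(internal,
            unsigned long virtual_chapter;
            unsigned int zone_number;
            void *next_request;
        );
    };

    int main(void)
    {
        struct request_sketch r = { .public_field = 42 };

        r.zone_number = 7;
        /* Reset every internal field at once, whatever order they are declared in. */
        memset(&r.internal, 0, sizeof(r.internal));
        printf("%d %u\n", r.public_field, r.zone_number); /* prints "42 0" */
        return 0;
    }

This is what lets the request reset in uds_launch_request() become a single memset() of request->internal, replacing the old offsetof()-based size calculation that had to assume zone_number was the first internal field.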
Signed-off-by: Ken Raeburn Signed-off-by: Matthew Sakai Signed-off-by: Mikulas Patocka --- drivers/md/dm-vdo/indexer/index-session.c | 6 +--- drivers/md/dm-vdo/indexer/indexer.h | 53 +++++++++++++++---------------- 2 files changed, 27 insertions(+), 32 deletions(-) diff --git a/drivers/md/dm-vdo/indexer/index-session.c b/drivers/md/dm-vdo/indexer/index-session.c index aee0914d604a..aa575a24e0b2 100644 --- a/drivers/md/dm-vdo/indexer/index-session.c +++ b/drivers/md/dm-vdo/indexer/index-session.c @@ -100,7 +100,6 @@ static int get_index_session(struct uds_index_session *index_session) int uds_launch_request(struct uds_request *request) { - size_t internal_size; int result; if (request->callback == NULL) { @@ -121,10 +120,7 @@ int uds_launch_request(struct uds_request *request) } /* Reset all internal fields before processing. */ - internal_size = - sizeof(struct uds_request) - offsetof(struct uds_request, zone_number); - // FIXME should be using struct_group for this instead - memset((char *) request + sizeof(*request) - internal_size, 0, internal_size); + memset(&request->internal, 0, sizeof(request->internal)); result = get_index_session(request->session); if (result != UDS_SUCCESS) diff --git a/drivers/md/dm-vdo/indexer/indexer.h b/drivers/md/dm-vdo/indexer/indexer.h index 183a94eb7e92..7c1fc4577f5b 100644 --- a/drivers/md/dm-vdo/indexer/indexer.h +++ b/drivers/md/dm-vdo/indexer/indexer.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -73,7 +74,7 @@ enum uds_request_type { /* Remove any mapping for a name. */ UDS_DELETE, -}; +} __packed; enum uds_open_index_type { /* Create a new index. */ @@ -226,7 +227,7 @@ struct uds_zone_message { enum uds_zone_message_type type; /* The virtual chapter number to which the message applies */ u64 virtual_chapter; -}; +} __packed; struct uds_index_session; struct uds_index; @@ -253,34 +254,32 @@ struct uds_request { /* The existing data associated with the request name, if any */ struct uds_record_data old_metadata; - /* Either UDS_SUCCESS or an error code for the request */ - int status; /* True if the record name had an existing entry in the index */ bool found; + /* Either UDS_SUCCESS or an error code for the request */ + int status; - /* - * The remaining fields are used internally and should not be altered by clients. The index - * relies on zone_number being the first field in this section. - */ - - /* The number of the zone which will process this request*/ - unsigned int zone_number; - /* A link for adding a request to a lock-free queue */ - struct funnel_queue_entry queue_link; - /* A link for adding a request to a standard linked list */ - struct uds_request *next_request; - /* A pointer to the index processing this request */ - struct uds_index *index; - /* Control message for coordinating between zones */ - struct uds_zone_message zone_message; - /* If true, process request immediately by waking the worker thread */ - bool unbatched; - /* If true, continue this request before processing newer requests */ - bool requeued; - /* The virtual chapter containing the record name, if known */ - u64 virtual_chapter; - /* The region of the index containing the record name */ - enum uds_index_region location; + /* The remaining fields are used internally and should not be altered by clients. 
*/ + struct_group(internal, + /* The virtual chapter containing the record name, if known */ + u64 virtual_chapter; + /* The region of the index containing the record name */ + enum uds_index_region location; + /* If true, process request immediately by waking the worker thread */ + bool unbatched; + /* If true, continue this request before processing newer requests */ + bool requeued; + /* Control message for coordinating between zones */ + struct uds_zone_message zone_message; + /* The number of the zone which will process this request*/ + unsigned int zone_number; + /* A link for adding a request to a lock-free queue */ + struct funnel_queue_entry queue_link; + /* A link for adding a request to a standard linked list */ + struct uds_request *next_request; + /* A pointer to the index processing this request */ + struct uds_index *index; + ); }; /* A session is required for most index operations. */ -- cgit v1.2.3 From 5da692e2262b8f81993baa9592f57d12c2703dea Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 6 Mar 2025 16:41:50 +0800 Subject: dm cache: prevent BUG_ON by blocking retries on failed device resumes A cache device failing to resume due to mapping errors should not be retried, as the failure leaves a partially initialized policy object. Repeating the resume operation risks triggering BUG_ON when reloading cache mappings into the incomplete policy object. Reproduce steps: 1. create a cache metadata consisting of 512 or more cache blocks, with some mappings stored in the first array block of the mapping array. Here we use cache_restore v1.0 to build the metadata. cat <> cmeta.xml EOF dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2 dmsetup remove cmeta 2. wipe the second array block of the mapping array to simulate data degradations. mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \ 2>/dev/null | hexdump -e '1/8 "%u\n"') ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \ 2>/dev/null | hexdump -e '1/8 "%u\n"') dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock 3. try bringing up the cache device. The resume is expected to fail due to the broken array block. dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc 262144" dmsetup create cache --notable dmsetup load cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" dmsetup resume cache 4. try resuming the cache again. An unexpected BUG_ON is triggered while loading cache mappings. dmsetup resume cache Kernel logs: (snip) ------------[ cut here ]------------ kernel BUG at drivers/md/dm-cache-policy-smq.c:752! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 332 Comm: dmsetup Not tainted 6.13.4 #3 RIP: 0010:smq_load_mapping+0x3e5/0x570 Fix by disallowing resume operations for devices that failed the initial attempt. 
Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 9cb797a561d6..6cee5eac8b9e 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2899,6 +2899,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2947,6 +2968,9 @@ static int cache_preresume(struct dm_target *ti) struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. */ -- cgit v1.2.3 From c2662b1544cbd8ea3181381bb899b8e681dfedc7 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 6 Mar 2025 16:41:51 +0800 Subject: dm cache: support shrinking the origin device This patch introduces formal support for shrinking the cache origin by reducing the cache target length via table reloads. Cache blocks mapped beyond the new target length must be clean and are invalidated during preresume. If any dirty blocks exist in the area being removed, the preresume operation fails without setting the NEEDS_CHECK flag in superblock, and the resume ioctl returns EFBIG. The cache device remains suspended until a table reload with target length that fits existing mappings is performed. Without this patch, reducing the cache target length could result in io errors (RHBZ: 2134334), out-of-bounds memory access to the discard bitset, and security concerns regarding data leakage. Verification steps: 1. create a cache metadata with some cached blocks mapped to the tail of the origin device. Here we use cache_restore v1.0 to build a metadata with one clean block mapped to the last origin block. cat <> cmeta.xml EOF dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2 dmsetup remove cmeta 2. bring up the cache whilst shrinking the cache origin by one block: dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524160 linear /dev/sdc 262144" dmsetup create cache --table "0 524160 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" 3. check the number of cached data blocks via dmsetup status. It is expected to be zero. 
dmsetup status cache | cut -d ' ' -f 7 In addition to the script above, this patch can be verified using the "cache/resize" tests in dmtest-python: ./dmtest run --rx cache/resize/shrink_origin --result-set default Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka --- drivers/md/dm-cache-target.c | 72 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 3 deletions(-) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 6cee5eac8b9e..a10d75a562db 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -406,6 +406,12 @@ struct cache { mempool_t migration_pool; struct bio_set bs; + + /* + * Cache_size entries. Set bits indicate blocks mapped beyond the + * target length, which are marked for invalidation. + */ + unsigned long *invalid_bitset; }; struct per_bio_data { @@ -1922,6 +1928,9 @@ static void __destroy(struct cache *cache) if (cache->discard_bitset) free_bitset(cache->discard_bitset); + if (cache->invalid_bitset) + free_bitset(cache->invalid_bitset); + if (cache->copier) dm_kcopyd_client_destroy(cache->copier); @@ -2510,6 +2519,13 @@ static int cache_create(struct cache_args *ca, struct cache **result) } clear_bitset(cache->discard_bitset, from_dblock(cache->discard_nr_blocks)); + cache->invalid_bitset = alloc_bitset(from_cblock(cache->cache_size)); + if (!cache->invalid_bitset) { + *error = "could not allocate bitset for invalid blocks"; + goto bad; + } + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + cache->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); if (IS_ERR(cache->copier)) { *error = "could not create kcopyd client"; @@ -2808,6 +2824,24 @@ static int load_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, return policy_load_mapping(cache->policy, oblock, cblock, dirty, hint, hint_valid); } +static int load_filtered_mapping(void *context, dm_oblock_t oblock, dm_cblock_t cblock, + bool dirty, uint32_t hint, bool hint_valid) +{ + struct cache *cache = context; + + if (from_oblock(oblock) >= from_oblock(cache->origin_blocks)) { + if (dirty) { + DMERR("%s: unable to shrink origin; cache block %u is dirty", + cache_device_name(cache), from_cblock(cblock)); + return -EFBIG; + } + set_bit(from_cblock(cblock), cache->invalid_bitset); + return 0; + } + + return load_mapping(context, oblock, cblock, dirty, hint, hint_valid); +} + /* * The discard block size in the on disk metadata is not * necessarily the same as we're currently using. So we have to @@ -2962,6 +2996,24 @@ static int resize_cache_dev(struct cache *cache, dm_cblock_t new_size) return 0; } +static int truncate_oblocks(struct cache *cache) +{ + uint32_t nr_blocks = from_cblock(cache->cache_size); + uint32_t i; + int r; + + for_each_set_bit(i, cache->invalid_bitset, nr_blocks) { + r = dm_cache_remove_mapping(cache->cmd, to_cblock(i)); + if (r) { + DMERR_LIMIT("%s: invalidation failed; couldn't update on disk metadata", + cache_device_name(cache)); + return r; + } + } + + return 0; +} + static int cache_preresume(struct dm_target *ti) { int r = 0; @@ -2986,11 +3038,25 @@ static int cache_preresume(struct dm_target *ti) } if (!cache->loaded_mappings) { + /* + * The fast device could have been resized since the last + * failed preresume attempt. To be safe we start by a blank + * bitset for cache blocks. 
+ */ + clear_bitset(cache->invalid_bitset, from_cblock(cache->cache_size)); + + r = dm_cache_load_mappings(cache->cmd, cache->policy, - load_mapping, cache); + load_filtered_mapping, cache); if (r) { DMERR("%s: could not load cache mappings", cache_device_name(cache)); - metadata_operation_failed(cache, "dm_cache_load_mappings", r); + if (r != -EFBIG) + metadata_operation_failed(cache, "dm_cache_load_mappings", r); + return r; + } + + r = truncate_oblocks(cache); + if (r) { + metadata_operation_failed(cache, "dm_cache_remove_mapping", r); return r; } @@ -3450,7 +3516,7 @@ static void cache_io_hints(struct dm_target *ti, struct queue_limits *limits) static struct target_type cache_target = { .name = "cache", - .version = {2, 2, 0}, + .version = {2, 3, 0}, .module = THIS_MODULE, .ctr = cache_ctr, .dtr = cache_dtr, -- cgit v1.2.3 From 45fc728515c14f53f6205789de5bfd72a95af3b8 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2025 13:51:32 +0100 Subject: dm: restrict dm device size to 2^63-512 bytes Devices with size >= 2^63 bytes can't be used reliably by userspace because the type off_t is a signed 64-bit integer. Therefore, we limit the maximum size of a device mapper device to 2^63-512 bytes. Signed-off-by: Mikulas Patocka --- drivers/md/dm-table.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 0ef5203387b2..04b3e9758e53 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -697,6 +697,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { -- cgit v1.2.3 From d43929ef65a60b4c44a5f85cdce826c4e33a67d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 21 Mar 2025 08:18:16 +0100 Subject: dm-delay: support zoned devices Add support for zoned devices by passing report_zones through to the underlying read device. This is required to enable xfstests xfs/311 on zoned devices.
Signed-off-by: Christoph Hellwig Reviewed-by: Damien Le Moal Signed-off-by: Mikulas Patocka --- drivers/md/dm-delay.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c index 08f6387620c1..d4cf0ac2a7aa 100644 --- a/drivers/md/dm-delay.c +++ b/drivers/md/dm-delay.c @@ -369,6 +369,21 @@ static int delay_map(struct dm_target *ti, struct bio *bio) return delay_bio(dc, c, bio); } +#ifdef CONFIG_BLK_DEV_ZONED +static int delay_report_zones(struct dm_target *ti, + struct dm_report_zones_args *args, unsigned int nr_zones) +{ + struct delay_c *dc = ti->private; + struct delay_class *c = &dc->read; + + return dm_report_zones(c->dev->bdev, c->start, + c->start + dm_target_offset(ti, args->next_sector), + args, nr_zones); +} +#else +#define delay_report_zones NULL +#endif + #define DMEMIT_DELAY_CLASS(c) \ DMEMIT("%s %llu %u", (c)->dev->name, (unsigned long long)(c)->start, (c)->delay) @@ -424,11 +439,12 @@ out: static struct target_type delay_target = { .name = "delay", .version = {1, 4, 0}, - .features = DM_TARGET_PASSES_INTEGRITY, + .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM, .module = THIS_MODULE, .ctr = delay_ctr, .dtr = delay_dtr, .map = delay_map, + .report_zones = delay_report_zones, .presuspend = delay_presuspend, .resume = delay_resume, .status = delay_status, -- cgit v1.2.3 From 5c5d0d7050286e14a6ca18b8d77fc7a34f701206 Mon Sep 17 00:00:00 2001 From: LongPing Wei Date: Thu, 27 Mar 2025 10:18:19 +0800 Subject: dm-verity: support block number limits for different ioprio classes Calling verity_verify_io in bh for IO of all sizes is not suitable for embedded devices. From our tests, it can improve the performance of 4K synchronous random reads. For example: ./fio --name=rand_read --ioengine=psync --rw=randread --bs=4K \ --direct=1 --numjobs=8 --runtime=60 --time_based --group_reporting \ --filename=/dev/block/mapper/xx-verity But it will degrade the performance of 512K synchronous sequential reads on our devices. For example: ./fio --name=read --ioengine=psync --rw=read --bs=512K --direct=1 \ --numjobs=8 --runtime=60 --time_based --group_reporting \ --filename=/dev/block/mapper/xx-verity A parameter array is introduced by this change, and users can modify the default configuration via /sys/module/dm_verity/parameters/use_bh_bytes. The default limit for NONE/RT/BE is set to 8192. The default limit for IDLE is set to 0. Call verity_verify_io directly when verity_end_io is not in hardirq. Signed-off-by: LongPing Wei Signed-off-by: Mikulas Patocka --- Documentation/admin-guide/device-mapper/verity.rst | 11 ++++++-- drivers/md/dm-verity-target.c | 29 +++++++++++++++++++--- 2 files changed, 35 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/device-mapper/verity.rst b/Documentation/admin-guide/device-mapper/verity.rst index fb3a045d0c72..8c3f1f967a3c 100644 --- a/Documentation/admin-guide/device-mapper/verity.rst +++ b/Documentation/admin-guide/device-mapper/verity.rst @@ -151,8 +151,15 @@ root_hash_sig_key_desc already in the secondary trusted keyring. try_verify_in_tasklet - If verity hashes are in cache, verify data blocks in kernel tasklet instead - of workqueue. This option can reduce IO latency. + If verity hashes are in cache and the IO size does not exceed the limit, + verify data blocks in bottom half instead of workqueue. This option can + reduce IO latency. The size limits can be configured via + /sys/module/dm_verity/parameters/use_bh_bytes.
The four parameters + correspond to limits for IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, + IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE in turn. + For example: + ,,, + 4096,4096,4096,4096 Theory of operation =================== diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index fc5ae123670b..40448a8c74b1 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -30,6 +30,7 @@ #define DM_VERITY_ENV_VAR_NAME "DM_VERITY_ERR_BLOCK_NR" #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 +#define DM_VERITY_USE_BH_DEFAULT_BYTES 8192 #define DM_VERITY_MAX_CORRUPTED_ERRS 100 @@ -49,6 +50,15 @@ static unsigned int dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, 0644); +static unsigned int dm_verity_use_bh_bytes[4] = { + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_NONE + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_RT + DM_VERITY_USE_BH_DEFAULT_BYTES, // IOPRIO_CLASS_BE + 0 // IOPRIO_CLASS_IDLE +}; + +module_param_array_named(use_bh_bytes, dm_verity_use_bh_bytes, uint, NULL, 0644); + static DEFINE_STATIC_KEY_FALSE(use_bh_wq_enabled); /* Is at least one dm-verity instance using ahash_tfm instead of shash_tfm? */ @@ -669,9 +679,17 @@ static void verity_bh_work(struct work_struct *w) verity_finish_io(io, errno_to_blk_status(err)); } +static inline bool verity_use_bh(unsigned int bytes, unsigned short ioprio) +{ + return ioprio <= IOPRIO_CLASS_IDLE && + bytes <= READ_ONCE(dm_verity_use_bh_bytes[ioprio]); +} + static void verity_end_io(struct bio *bio) { struct dm_verity_io *io = bio->bi_private; + unsigned short ioprio = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + unsigned int bytes = io->n_blocks << io->v->data_dev_block_bits; if (bio->bi_status && (!verity_fec_is_enabled(io->v) || @@ -681,9 +699,14 @@ static void verity_end_io(struct bio *bio) return; } - if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq) { - INIT_WORK(&io->bh_work, verity_bh_work); - queue_work(system_bh_wq, &io->bh_work); + if (static_branch_unlikely(&use_bh_wq_enabled) && io->v->use_bh_wq && + verity_use_bh(bytes, ioprio)) { + if (in_hardirq() || irqs_disabled()) { + INIT_WORK(&io->bh_work, verity_bh_work); + queue_work(system_bh_wq, &io->bh_work); + } else { + verity_bh_work(&io->bh_work); + } } else { INIT_WORK(&io->work, verity_work); queue_work(io->v->verify_wq, &io->work); -- cgit v1.2.3 From 8bde1033f9cfc1c08628255cc434c6cf39c9d9ba Mon Sep 17 00:00:00 2001 From: Jo Van Bulck Date: Fri, 28 Mar 2025 16:04:47 +0100 Subject: dm-integrity: fix non-constant-time tag verification When using dm-integrity in standalone mode with a keyed hmac algorithm, integrity tags are calculated and verified internally. Using plain memcmp to compare the stored and computed tags may leak the position of the first byte mismatch through side-channel analysis, allowing to brute-force expected tags in linear time (e.g., by counting single-stepping interrupts in confidential virtual machine environments). 
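The problem with memcmp() here is its data-dependent early exit: it may return as soon as the first mismatching byte is found, so its running time reveals the length of the matching prefix. A constant-time comparison instead accumulates the differences over the full length and only inspects the result at the end. A rough, self-contained sketch of that idea follows; it is not the kernel's crypto_memneq() implementation, which additionally hides intermediate values from compiler optimizations.

    #include <stddef.h>

    /*
     * Constant-time "not equal": always touch all n bytes and take no
     * data-dependent branches, so the run time does not depend on where
     * (or whether) the two buffers differ.
     */
    static int sketch_memneq(const void *a, const void *b, size_t n)
    {
        const unsigned char *pa = a, *pb = b;
        unsigned char diff = 0;
        size_t i;

        for (i = 0; i < n; i++)
            diff |= pa[i] ^ pb[i];

        return diff != 0; /* nonzero iff the buffers differ */
    }

The patch below applies the same principle: memcmp() is replaced by crypto_memneq() for the tag and MAC checks, and the MAY_BE_HASH/MAY_BE_FILLER early-exit bookkeeping in dm_integrity_rw_tag() is rewritten as byte-wise accumulators so that the control flow no longer depends on which byte mismatches.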
Co-developed-by: Luca Wilke Signed-off-by: Luca Wilke Signed-off-by: Jo Van Bulck Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-integrity.c | 45 ++++++++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index f41e64f1dab2..516822007921 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -516,7 +517,7 @@ static int sb_mac(struct dm_integrity_c *ic, bool wr) dm_integrity_io_error(ic, "crypto_shash_digest", r); return r; } - if (memcmp(mac, actual_mac, mac_size)) { + if (crypto_memneq(mac, actual_mac, mac_size)) { dm_integrity_io_error(ic, "superblock mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-superblock", ic->ti, 0); return -EILSEQ; @@ -859,7 +860,7 @@ static void rw_section_mac(struct dm_integrity_c *ic, unsigned int section, bool if (likely(wr)) memcpy(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR); else { - if (memcmp(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { + if (crypto_memneq(&js->mac, result + (j * JOURNAL_MAC_PER_SECTOR), JOURNAL_MAC_PER_SECTOR)) { dm_integrity_io_error(ic, "journal mac", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "mac-journal", ic->ti, 0); } @@ -1401,10 +1402,9 @@ static bool find_newer_committed_node(struct dm_integrity_c *ic, struct journal_ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, sector_t *metadata_block, unsigned int *metadata_offset, unsigned int total_size, int op) { -#define MAY_BE_FILLER 1 -#define MAY_BE_HASH 2 unsigned int hash_offset = 0; - unsigned int may_be = MAY_BE_HASH | (ic->discard ? MAY_BE_FILLER : 0); + unsigned char mismatch_hash = 0; + unsigned char mismatch_filler = !ic->discard; do { unsigned char *data, *dp; @@ -1425,7 +1425,7 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se if (op == TAG_READ) { memcpy(tag, dp, to_copy); } else if (op == TAG_WRITE) { - if (memcmp(dp, tag, to_copy)) { + if (crypto_memneq(dp, tag, to_copy)) { memcpy(dp, tag, to_copy); dm_bufio_mark_partial_buffer_dirty(b, *metadata_offset, *metadata_offset + to_copy); } @@ -1433,29 +1433,30 @@ static int dm_integrity_rw_tag(struct dm_integrity_c *ic, unsigned char *tag, se /* e.g.: op == TAG_CMP */ if (likely(is_power_of_2(ic->tag_size))) { - if (unlikely(memcmp(dp, tag, to_copy))) - if (unlikely(!ic->discard) || - unlikely(memchr_inv(dp, DISCARD_FILLER, to_copy) != NULL)) { - goto thorough_test; - } + if (unlikely(crypto_memneq(dp, tag, to_copy))) + goto thorough_test; } else { unsigned int i, ts; thorough_test: ts = total_size; for (i = 0; i < to_copy; i++, ts--) { - if (unlikely(dp[i] != tag[i])) - may_be &= ~MAY_BE_HASH; - if (likely(dp[i] != DISCARD_FILLER)) - may_be &= ~MAY_BE_FILLER; + /* + * Warning: the control flow must not be + * dependent on match/mismatch of + * individual bytes. + */ + mismatch_hash |= dp[i] ^ tag[i]; + mismatch_filler |= dp[i] ^ DISCARD_FILLER; hash_offset++; if (unlikely(hash_offset == ic->tag_size)) { - if (unlikely(!may_be)) { + if (unlikely(mismatch_hash) && unlikely(mismatch_filler)) { dm_bufio_release(b); return ts; } hash_offset = 0; - may_be = MAY_BE_HASH | (ic->discard ? 
MAY_BE_FILLER : 0); + mismatch_hash = 0; + mismatch_filler = !ic->discard; } } } @@ -1476,8 +1477,6 @@ thorough_test: } while (unlikely(total_size)); return 0; -#undef MAY_BE_FILLER -#undef MAY_BE_HASH } struct flush_request { @@ -2076,7 +2075,7 @@ retry_kmap: char checksums_onstack[MAX_T(size_t, HASH_MAX_DIGESTSIZE, MAX_TAG_SIZE)]; integrity_sector_checksum(ic, logical_sector, mem + bv.bv_offset, checksums_onstack); - if (unlikely(memcmp(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { + if (unlikely(crypto_memneq(checksums_onstack, journal_entry_tag(ic, je), ic->tag_size))) { DMERR_LIMIT("Checksum failed when reading from journal, at sector 0x%llx", logical_sector); dm_audit_log_bio(DM_MSG_PREFIX, "journal-checksum", @@ -2595,7 +2594,7 @@ static void dm_integrity_inline_recheck(struct work_struct *w) bio_put(outgoing_bio); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, outgoing_data, digest); - if (unlikely(memcmp(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { + if (unlikely(crypto_memneq(digest, dio->integrity_payload, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { DMERR_LIMIT("%pg: Checksum failed at sector 0x%llx", ic->dev->bdev, dio->bio_details.bi_iter.bi_sector); atomic64_inc(&ic->number_of_mismatches); @@ -2634,7 +2633,7 @@ static int dm_integrity_end_io(struct dm_target *ti, struct bio *bio, blk_status char *mem = bvec_kmap_local(&bv); //memset(mem, 0xff, ic->sectors_per_block << SECTOR_SHIFT); integrity_sector_checksum(ic, dio->bio_details.bi_iter.bi_sector, mem, digest); - if (unlikely(memcmp(digest, dio->integrity_payload + pos, + if (unlikely(crypto_memneq(digest, dio->integrity_payload + pos, min(crypto_shash_digestsize(ic->internal_hash), ic->tag_size)))) { kunmap_local(mem); dm_integrity_free_payload(dio); @@ -2911,7 +2910,7 @@ static void do_journal_write(struct dm_integrity_c *ic, unsigned int write_start integrity_sector_checksum(ic, sec + ((l - j) << ic->sb->log2_sectors_per_block), (char *)access_journal_data(ic, i, l), test_tag); - if (unlikely(memcmp(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { + if (unlikely(crypto_memneq(test_tag, journal_entry_tag(ic, je2), ic->tag_size))) { dm_integrity_io_error(ic, "tag mismatch when replaying journal", -EILSEQ); dm_audit_log_target(DM_MSG_PREFIX, "integrity-replay-journal", ic->ti, 0); } -- cgit v1.2.3 From 2de510fccbca3d1906b55f4be5f1de83fa2424ef Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 28 Mar 2025 16:17:45 +0100 Subject: dm-verity: fix prefetch-vs-suspend race There's a possible race condition in dm-verity - the prefetch work item may race with suspend and it is possible that prefetch continues to run while the device is suspended. Fix this by calling flush_workqueue and dm_bufio_client_reset in the postsuspend hook. 
Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-verity-target.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c index 40448a8c74b1..3c427f18a04b 100644 --- a/drivers/md/dm-verity-target.c +++ b/drivers/md/dm-verity-target.c @@ -836,6 +836,13 @@ static int verity_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_SUBMITTED; } +static void verity_postsuspend(struct dm_target *ti) +{ + struct dm_verity *v = ti->private; + flush_workqueue(v->verify_wq); + dm_bufio_client_reset(v->bufio); +} + /* * Status: V (valid) or C (corruption found) */ @@ -1806,6 +1813,7 @@ static struct target_type verity_target = { .ctr = verity_ctr, .dtr = verity_dtr, .map = verity_map, + .postsuspend = verity_postsuspend, .status = verity_status, .prepare_ioctl = verity_prepare_ioctl, .iterate_devices = verity_iterate_devices, -- cgit v1.2.3 From 9c565428788fb9b49066f94ab7b10efc686a0a4c Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 28 Mar 2025 16:19:07 +0100 Subject: dm-ebs: fix prefetch-vs-suspend race There's a possible race condition in dm-ebs - dm bufio prefetch may be in progress while the device is suspended. Fix this by calling dm_bufio_client_reset in the postsuspend hook. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org --- drivers/md/dm-ebs-target.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/md/dm-ebs-target.c b/drivers/md/dm-ebs-target.c index 18ae45dcbfb2..b19b0142a690 100644 --- a/drivers/md/dm-ebs-target.c +++ b/drivers/md/dm-ebs-target.c @@ -390,6 +390,12 @@ static int ebs_map(struct dm_target *ti, struct bio *bio) return DM_MAPIO_REMAPPED; } +static void ebs_postsuspend(struct dm_target *ti) +{ + struct ebs_c *ec = ti->private; + dm_bufio_client_reset(ec->bufio); +} + static void ebs_status(struct dm_target *ti, status_type_t type, unsigned int status_flags, char *result, unsigned int maxlen) { @@ -447,6 +453,7 @@ static struct target_type ebs_target = { .ctr = ebs_ctr, .dtr = ebs_dtr, .map = ebs_map, + .postsuspend = ebs_postsuspend, .status = ebs_status, .io_hints = ebs_io_hints, .prepare_ioctl = ebs_prepare_ioctl, -- cgit v1.2.3