From 2e96d2d8c2a7a6c2cef45593c028d9c5ef180316 Mon Sep 17 00:00:00 2001
From: Alok Tiwari
Date: Sat, 28 Jun 2025 11:12:32 -0700
Subject: nvme: Fix incorrect cdw15 value in passthru error logging

Fix an error in nvme_log_err_passthru() where cdw14 was incorrectly
printed twice instead of cdw15. This fix ensures accurate logging of
the full passthrough command payload.

Fixes: 9f079dda1433 ("nvme: allow passthru cmd error logging")
Signed-off-by: Alok Tiwari
Reviewed-by: Martin K. Petersen
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index e533d791955d..2ae36bce615e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -386,7 +386,7 @@ static void nvme_log_err_passthru(struct request *req)
 		nr->cmd->common.cdw12,
 		nr->cmd->common.cdw13,
 		nr->cmd->common.cdw14,
-		nr->cmd->common.cdw14);
+		nr->cmd->common.cdw15);
 }
 
 enum nvme_disposition {
--
cgit v1.2.3

From ba806c900379899e5cdd6ca165b900e2081e1c99 Mon Sep 17 00:00:00 2001
From: Nilay Shroff
Date: Thu, 26 Jun 2025 10:49:19 +0530
Subject: nvme: correctly account for namespace head reference counter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The blktests test case nvme/058 manifests an issue where the NVMe
subsystem kobject entry remains stale in sysfs, causing a failure
during subsequent NVMe module reloads[1]. Specifically, when attempting
to register a new NVMe subsystem, the driver encounters a kobject name
collision because a stale kobject still exists. Note that nvme/058
itself passes and reports no failure; it is only during subsequent NVMe
module reloads that the stale nvme subsystem kobject entry in sysfs
causes the observed symptom[1].

This issue stems from an imbalance in the get/put usage of the
namespace head (nshead) reference counter. The nshead holds a reference
to the associated NVMe subsystem. If the nshead reference is not
properly released, it prevents the cleanup of the subsystem's kobject,
leaving a stale nvme subsystem entry behind in sysfs.

In the failure case, the last namespace path referencing a nshead is
removed, but the nshead reference is not released. This occurs because
the release logic currently only puts the nshead reference when its
state is LIVE. However, in configurations where ANA (Asymmetric
Namespace Access) is enabled, a namespace may be associated with an ANA
state that is neither optimized nor non-optimized. In this case, the
nshead may never transition to LIVE, and the corresponding nshead
reference is then never dropped. In fact, nvme/058 associates some of
the nvme namespaces with an inaccessible ANA state, so a nshead is
created but its state never transitions to LIVE. The current logic thus
causes the nshead reference to be leaked for non-LIVE states.

In another scenario, during namespace allocation, the driver first
allocates a nshead and then issues an Identify Namespace command. If
this command fails — which can happen in tests like nvme/058 that
rapidly enable and disable namespaces — we must release the reference
to the newly allocated nshead. However, this release is currently
missing from the failure path, causing a nshead reference leak.

To fix this, we now unconditionally release the nshead reference when
the last nvme path referencing the nshead is removed, regardless of the
head's state.
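Concretely, in nvme_remove_head() the final reference drop moves out of
the head->disk branch. As a rough before/after sketch of that logic
(simplified; the actual change is in the multipath.c hunk below):

	/*
	 * before: the nshead reference was only dropped inside the
	 * head->disk branch, so heads that never went LIVE leaked it
	 */
	if (head->disk) {
		...
		del_gendisk(head->disk);
		nvme_put_ns_head(head);
	}

	/* after: the reference is dropped unconditionally */
	if (head->disk) {
		...
		del_gendisk(head->disk);
	}
	nvme_put_ns_head(head);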
Also, in the Identify Namespace failure case, we now properly release
the nshead reference. Together these changes ensure proper cleanup of
the nshead and, consequently, of the NVMe subsystem and its associated
kobject. This prevents stale kobject entries from lingering in sysfs
and eliminates the module reload failures observed just after running
nvme/058.

[1] https://lore.kernel.org/all/CAHj4cs8fOBS-eSjsd5LUBzy7faKXJtgLkCN+mDy_-ezCLLLq+Q@mail.gmail.com/

Reported-by: yi.zhang@redhat.com
Closes: https://lore.kernel.org/all/CAHj4cs8fOBS-eSjsd5LUBzy7faKXJtgLkCN+mDy_-ezCLLLq+Q@mail.gmail.com/
Fixes: 62188639ec16 ("nvme-multipath: introduce delayed removal of the multipath head node")
Tested-by: yi.zhang@redhat.com
Signed-off-by: Nilay Shroff
Reviewed-by: Hannes Reinecke
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/core.c      | 16 +++++++++++++++-
 drivers/nvme/host/multipath.c |  5 ++++-
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2ae36bce615e..7493e5aa984c 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4086,6 +4086,7 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	struct nvme_ns *ns;
 	struct gendisk *disk;
 	int node = ctrl->numa_node;
+	bool last_path = false;
 
 	ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node);
 	if (!ns)
@@ -4178,9 +4179,22 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
-	if (list_empty(&ns->head->list))
+	if (list_empty(&ns->head->list)) {
 		list_del_init(&ns->head->entry);
+		/*
+		 * If multipath is not configured, we still create a namespace
+		 * head (nshead), but head->disk is not initialized in that
+		 * case. As a result, only a single reference to nshead is held
+		 * (via kref_init()) when it is created. Therefore, ensure that
+		 * we do not release the reference to nshead twice if head->disk
+		 * is not present.
+		 */
+		if (ns->head->disk)
+			last_path = true;
+	}
 	mutex_unlock(&ctrl->subsys->lock);
+	if (last_path)
+		nvme_put_ns_head(ns->head);
 	nvme_put_ns_head(ns->head);
 out_cleanup_disk:
 	put_disk(disk);
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1062467595f3..e03f3a29bd75 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -690,8 +690,8 @@ static void nvme_remove_head(struct nvme_ns_head *head)
 		nvme_cdev_del(&head->cdev, &head->cdev_device);
 		synchronize_srcu(&head->srcu);
 		del_gendisk(head->disk);
-		nvme_put_ns_head(head);
 	}
+	nvme_put_ns_head(head);
 }
 
 static void nvme_remove_head_work(struct work_struct *work)
@@ -1291,6 +1291,9 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
 {
 	bool remove = false;
 
+	if (!head->disk)
+		return;
+
 	mutex_lock(&head->subsys->lock);
 	/*
 	 * We are called when all paths have been removed, and at that point
--
cgit v1.2.3

From 190f4c2c863af7cc5bb354b70e0805f06419c038 Mon Sep 17 00:00:00 2001
From: Dmitry Bogdanov
Date: Wed, 25 Jun 2025 14:45:33 +0300
Subject: nvmet: fix memory leak of bio integrity

If nvmet receives commands with metadata, there is a continuous memory
leak of the kmalloc-128 slab, or more precisely of bio->bi_integrity.

Since commit bf4c89fc8797 ("block: don't call bio_uninit from
bio_endio") each user of bio_init has to use bio_uninit as well.
Otherwise the bio integrity is never freed. Nvmet uses bio_init for
inline bios.

Uninit the inline bio to complete deallocation of the integrity in the
bio.
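The required pairing looks roughly like this (a minimal illustrative
sketch, not the nvmet code; submission and completion are elided):

	struct bio bio;
	struct bio_vec bvec;

	/* caller-initialized bio, e.g. embedded in a request structure */
	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);

	/* ... submit the bio and wait for completion ... */

	/*
	 * bio_endio() no longer calls bio_uninit(), so the owner of a
	 * caller-initialized bio must uninit it itself; otherwise
	 * attached state such as bio->bi_integrity is leaked.
	 */
	bio_uninit(&bio);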
Fixes: bf4c89fc8797 ("block: don't call bio_uninit from bio_endio")
Signed-off-by: Dmitry Bogdanov
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/target/nvmet.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index df69a9dee71c..51df72f5e89b 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -867,6 +867,8 @@ static inline void nvmet_req_bio_put(struct nvmet_req *req, struct bio *bio)
 {
 	if (bio != &req->b.inline_bio)
 		bio_put(bio);
+	else
+		bio_uninit(bio);
 }
 
 #ifdef CONFIG_NVME_TARGET_TCP_TLS
--
cgit v1.2.3

From 14005c96d6649b27fa52d0cd0f492eb3b5586c07 Mon Sep 17 00:00:00 2001
From: Eugen Hristev
Date: Fri, 13 Jun 2025 16:21:01 -0300
Subject: nvme-pci: refresh visible attrs after being checked
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The sysfs attributes are registered early, but the driver does not know
whether they are needed or not at that moment.

For the CMB attributes, commit e917a849c3fc ("nvme-pci: refresh visible
attrs for cmb attributes") solved this problem by calling
nvme_update_attrs after mapping the CMB. However, the issue persists
for the HMB attributes.

To solve the problem, move the call to nvme_update_attrs after
nvme_setup_host_mem, which sets up the HMB.

Fixes: e917a849c3fc ("nvme-pci: refresh visible attrs for cmb attributes")
Fixes: 86adbf0cdb9e ("nvme: simplify transport specific device attribute handling")
Signed-off-by: Eugen Hristev
Signed-off-by: André Almeida
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/pci.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8ff12e415cb5..320aaa41ec39 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2101,8 +2101,6 @@ static void nvme_map_cmb(struct nvme_dev *dev)
 	if ((dev->cmbsz & (NVME_CMBSZ_WDS | NVME_CMBSZ_RDS)) ==
 			(NVME_CMBSZ_WDS | NVME_CMBSZ_RDS))
 		pci_p2pmem_publish(pdev, true);
-
-	nvme_update_attrs(dev);
 }
 
 static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
@@ -3010,6 +3008,8 @@ static void nvme_reset_work(struct work_struct *work)
 	if (result < 0)
 		goto out;
 
+	nvme_update_attrs(dev);
+
 	result = nvme_setup_io_queues(dev);
 	if (result)
 		goto out;
@@ -3343,6 +3343,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 	if (result < 0)
 		goto out_disable;
 
+	nvme_update_attrs(dev);
+
 	result = nvme_setup_io_queues(dev);
 	if (result)
 		goto out_disable;
--
cgit v1.2.3

From d6811074203b13f715ce2480ac64c5b1c773f2a5 Mon Sep 17 00:00:00 2001
From: Geliang Tang
Date: Mon, 30 Jun 2025 15:48:32 +0800
Subject: nvme-multipath: fix suspicious RCU usage warning

When I run the NVMe over TCP test in virtme-ng, I get the following
"suspicious RCU usage" warning in nvme_mpath_add_sysfs_link():

'''
[    5.024557][   T44] nvmet: Created nvm controller 1 for subsystem nqn.2025-06.org.nvmexpress.mptcp for NQN nqn.2014-08.org.nvmexpress:uuid:f7f6b5e0-ff97-4894-98ac-c85309e0bc77.
[    5.027401][  T183] nvme nvme0: creating 2 I/O queues.
[    5.029017][  T183] nvme nvme0: mapped 2/0/0 default/read/poll queues.
[    5.032587][  T183] nvme nvme0: new ctrl: NQN "nqn.2025-06.org.nvmexpress.mptcp", addr 127.0.0.1:4420, hostnqn: nqn.2014-08.org.nvmexpress:uuid:f7f6b5e0-ff97-4894-98ac-c85309e0bc77
[    5.042214][   T25]
[    5.042440][   T25] =============================
[    5.042579][   T25] WARNING: suspicious RCU usage
[    5.042705][   T25] 6.16.0-rc3+ #23 Not tainted
[    5.042812][   T25] -----------------------------
[    5.042934][   T25] drivers/nvme/host/multipath.c:1203 RCU-list traversed in non-reader section!!
[    5.043111][   T25]
[    5.043111][   T25] other info that might help us debug this:
[    5.043111][   T25]
[    5.043341][   T25]
[    5.043341][   T25] rcu_scheduler_active = 2, debug_locks = 1
[    5.043502][   T25] 3 locks held by kworker/u9:0/25:
[    5.043615][   T25]  #0: ffff888008730948 ((wq_completion)async){+.+.}-{0:0}, at: process_one_work+0x7ed/0x1350
[    5.043830][   T25]  #1: ffffc900001afd40 ((work_completion)(&entry->work)){+.+.}-{0:0}, at: process_one_work+0xcf3/0x1350
[    5.044084][   T25]  #2: ffff888013ee0020 (&head->srcu){.+.+}-{0:0}, at: nvme_mpath_add_sysfs_link.part.0+0xb4/0x3a0
[    5.044300][   T25]
[    5.044300][   T25] stack backtrace:
[    5.044439][   T25] CPU: 0 UID: 0 PID: 25 Comm: kworker/u9:0 Not tainted 6.16.0-rc3+ #23 PREEMPT(full)
[    5.044441][   T25] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[    5.044442][   T25] Workqueue: async async_run_entry_fn
[    5.044445][   T25] Call Trace:
[    5.044446][   T25]  <TASK>
[    5.044449][   T25]  dump_stack_lvl+0x6f/0xb0
[    5.044453][   T25]  lockdep_rcu_suspicious.cold+0x4f/0xb1
[    5.044457][   T25]  nvme_mpath_add_sysfs_link.part.0+0x2fb/0x3a0
[    5.044459][   T25]  ? queue_work_on+0x90/0xf0
[    5.044461][   T25]  ? lockdep_hardirqs_on+0x78/0x110
[    5.044466][   T25]  nvme_mpath_set_live+0x1e9/0x4f0
[    5.044470][   T25]  nvme_mpath_add_disk+0x240/0x2f0
[    5.044472][   T25]  ? __pfx_nvme_mpath_add_disk+0x10/0x10
[    5.044475][   T25]  ? add_disk_fwnode+0x361/0x580
[    5.044480][   T25]  nvme_alloc_ns+0x81c/0x17c0
[    5.044483][   T25]  ? kasan_quarantine_put+0x104/0x240
[    5.044487][   T25]  ? __pfx_nvme_alloc_ns+0x10/0x10
[    5.044495][   T25]  ? __pfx_nvme_find_get_ns+0x10/0x10
[    5.044496][   T25]  ? rcu_read_lock_any_held+0x45/0xa0
[    5.044498][   T25]  ? validate_chain+0x232/0x4f0
[    5.044503][   T25]  nvme_scan_ns+0x4c8/0x810
[    5.044506][   T25]  ? __pfx_nvme_scan_ns+0x10/0x10
[    5.044508][   T25]  ? find_held_lock+0x2b/0x80
[    5.044512][   T25]  ? ktime_get+0x16d/0x220
[    5.044517][   T25]  ? kvm_clock_get_cycles+0x18/0x30
[    5.044520][   T25]  ? __pfx_nvme_scan_ns_async+0x10/0x10
[    5.044522][   T25]  async_run_entry_fn+0x97/0x560
[    5.044523][   T25]  ? rcu_is_watching+0x12/0xc0
[    5.044526][   T25]  process_one_work+0xd3c/0x1350
[    5.044532][   T25]  ? __pfx_process_one_work+0x10/0x10
[    5.044536][   T25]  ? assign_work+0x16c/0x240
[    5.044539][   T25]  worker_thread+0x4da/0xd50
[    5.044545][   T25]  ? __pfx_worker_thread+0x10/0x10
[    5.044546][   T25]  kthread+0x356/0x5c0
[    5.044548][   T25]  ? __pfx_kthread+0x10/0x10
[    5.044549][   T25]  ? ret_from_fork+0x1b/0x2e0
[    5.044552][   T25]  ? __lock_release.isra.0+0x5d/0x180
[    5.044553][   T25]  ? ret_from_fork+0x1b/0x2e0
[    5.044555][   T25]  ? rcu_is_watching+0x12/0xc0
[    5.044557][   T25]  ? __pfx_kthread+0x10/0x10
[    5.044559][   T25]  ret_from_fork+0x218/0x2e0
[    5.044561][   T25]  ? __pfx_kthread+0x10/0x10
[    5.044562][   T25]  ret_from_fork_asm+0x1a/0x30
[    5.044570][   T25]  </TASK>
'''

This patch fixes it by using the sleepable-RCU variant of the
list-traversal helper, list_for_each_entry_srcu(), instead of
list_for_each_entry_rcu().
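The fixed traversal, matching the hunk below, follows the usual SRCU
pattern:

	srcu_idx = srcu_read_lock(&head->srcu);
	list_for_each_entry_srcu(ns, &head->list, siblings,
				 srcu_read_lock_held(&head->srcu)) {
		/* ... add the per-path sysfs link for ns ... */
	}
	srcu_read_unlock(&head->srcu, srcu_idx);

The fourth argument is a lockdep condition: passing
srcu_read_lock_held(&head->srcu) tells the RCU-list debugging code that
the list is protected by the head's SRCU read-side critical section
rather than by plain rcu_read_lock().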
Fixes: 4dbd2b2ebe4c ("nvme-multipath: Add visibility for round-robin io-policy")
Signed-off-by: Geliang Tang
Reviewed-by: Keith Busch
Reviewed-by: Hannes Reinecke
Reviewed-by: Nilay Shroff
Signed-off-by: Christoph Hellwig
---
 drivers/nvme/host/multipath.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index e03f3a29bd75..5d1ad28986e9 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -1200,7 +1200,8 @@ void nvme_mpath_add_sysfs_link(struct nvme_ns_head *head)
 	 */
 	srcu_idx = srcu_read_lock(&head->srcu);
 
-	list_for_each_entry_rcu(ns, &head->list, siblings) {
+	list_for_each_entry_srcu(ns, &head->list, siblings,
+				srcu_read_lock_held(&head->srcu)) {
 		/*
 		 * Ensure that ns path disk node is already added otherwise we
 		 * may get invalid kobj name for target
--
cgit v1.2.3

From 01ed88aea527e19def9070349399684522c66c72 Mon Sep 17 00:00:00 2001
From: Ming Lei
Date: Tue, 1 Jul 2025 15:23:25 +0800
Subject: ublk: don't queue request if the associated uring_cmd is canceled

Commit 524346e9d79f ("ublk: build batch from IOs in same io_ring_ctx
and io task") needs to dereference `io->cmd` to check whether an IO can
be added to the current batch, see ublk_belong_to_same_batch() and
io_uring_cmd_ctx_handle(). However, `io->cmd` may become invalid after
the uring_cmd is canceled.

Fix it by only queueing the IO when ublk_prep_req() returns
`BLK_STS_OK`, in which case `io->cmd` is guaranteed to be valid.

Reported-by: Changhui Zhong
Fixes: 524346e9d79f ("ublk: build batch from IOs in same io_ring_ctx and io task")
Signed-off-by: Ming Lei
Link: https://lore.kernel.org/r/20250701072325.1458109-1-ming.lei@redhat.com
Signed-off-by: Jens Axboe
---
 drivers/block/ublk_drv.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
index c3e3c3b65a6d..9fd284fa76dc 100644
--- a/drivers/block/ublk_drv.c
+++ b/drivers/block/ublk_drv.c
@@ -1442,15 +1442,16 @@ static void ublk_queue_rqs(struct rq_list *rqlist)
 		struct ublk_queue *this_q = req->mq_hctx->driver_data;
 		struct ublk_io *this_io = &this_q->ios[req->tag];
 
+		if (ublk_prep_req(this_q, req, true) != BLK_STS_OK) {
+			rq_list_add_tail(&requeue_list, req);
+			continue;
+		}
+
 		if (io && !ublk_belong_to_same_batch(io, this_io) &&
 				!rq_list_empty(&submit_list))
 			ublk_queue_cmd_list(io, &submit_list);
 		io = this_io;
-
-		if (ublk_prep_req(this_q, req, true) == BLK_STS_OK)
-			rq_list_add_tail(&submit_list, req);
-		else
-			rq_list_add_tail(&requeue_list, req);
+		rq_list_add_tail(&submit_list, req);
 	}
 
 	if (!rq_list_empty(&submit_list))
--
cgit v1.2.3

From 0d519bb0de3bf0ac9e6f401d4910fc119062d7be Mon Sep 17 00:00:00 2001
From: Yu Kuai
Date: Mon, 30 Jun 2025 19:28:28 +0800
Subject: brd: fix sleeping function called from invalid context in brd_insert_page()

__xa_cmpxchg() is called with rcu_read_lock() held, yet it may allocate
memory if necessary.

Fix the problem by moving rcu_read_lock() to after __xa_cmpxchg(); it
must still be taken before xa_unlock(), however, to prevent the
returned page from being freed by a concurrent discard.
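The resulting ordering in brd_insert_page() is roughly (sketch
mirroring the diff below):

	rcu_read_unlock();		/* allocation may sleep, drop RCU */

	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
	if (!page) {
		rcu_read_lock();	/* callers expect RCU held on return */
		return ERR_PTR(-ENOMEM);
	}

	xa_lock(&brd->brd_pages);
	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT,
			NULL, page, gfp);
	rcu_read_lock();		/* re-taken before xa_unlock() */

Re-taking rcu_read_lock() while the xarray lock is still held closes
the window in which a concurrent discard could free the page before the
caller dereferences it.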
Fixes: bbcacab2e8ee ("brd: avoid extra xarray lookups on first write")
Reported-by: syzbot+ea4c8fd177a47338881a@syzkaller.appspotmail.com
Closes: https://lore.kernel.org/all/685ec4c9.a00a0220.129264.000c.GAE@google.com/
Signed-off-by: Yu Kuai
Reviewed-by: Christoph Hellwig
Link: https://lore.kernel.org/r/20250630112828.421219-1-yukuai1@huaweicloud.com
Signed-off-by: Jens Axboe
---
 drivers/block/brd.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index b1be6c510372..0c2eabe14af3 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -64,13 +64,15 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector,
 	rcu_read_unlock();
 
 	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
-	rcu_read_lock();
-	if (!page)
+	if (!page) {
+		rcu_read_lock();
 		return ERR_PTR(-ENOMEM);
+	}
 
 	xa_lock(&brd->brd_pages);
 	ret = __xa_cmpxchg(&brd->brd_pages, sector >> PAGE_SECTORS_SHIFT, NULL,
 			page, gfp);
+	rcu_read_lock();
 	if (ret) {
 		xa_unlock(&brd->brd_pages);
 		__free_page(page);
--
cgit v1.2.3