From 5cadfbd5a11e5495cac217534c5f788168b1afd7 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Mon, 27 Mar 2023 16:14:49 +0200 Subject: fuse: add feature flag for expire-only Add an init flag idicating whether the FUSE_EXPIRE_ONLY flag of FUSE_NOTIFY_INVAL_ENTRY is effective. This is needed for backports of this feature, otherwise the server could just check the protocol version. Fixes: 4f8d37020e1f ("fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY") Cc: # v6.2 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 3 ++- include/uapi/linux/fuse.h | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d66070af145d..660be31aaabc 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1254,7 +1254,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | - FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP; + FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | + FUSE_HAS_EXPIRE_ONLY; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h index 1b9d0dfae72d..b3fcab13fcd3 100644 --- a/include/uapi/linux/fuse.h +++ b/include/uapi/linux/fuse.h @@ -206,6 +206,7 @@ * - add extension header * - add FUSE_EXT_GROUPS * - add FUSE_CREATE_SUPP_GROUP + * - add FUSE_HAS_EXPIRE_ONLY */ #ifndef _LINUX_FUSE_H @@ -369,6 +370,7 @@ struct fuse_file_lock { * FUSE_HAS_INODE_DAX: use per inode DAX * FUSE_CREATE_SUPP_GROUP: add supplementary group info to create, mkdir, * symlink and mknod (single group that matches parent) + * FUSE_HAS_EXPIRE_ONLY: kernel supports expiry-only entry invalidation */ #define FUSE_ASYNC_READ (1 << 0) #define FUSE_POSIX_LOCKS (1 << 1) @@ -406,6 +408,7 @@ struct fuse_file_lock { #define FUSE_SECURITY_CTX (1ULL << 32) #define FUSE_HAS_INODE_DAX (1ULL << 33) #define FUSE_CREATE_SUPP_GROUP (1ULL << 34) +#define FUSE_HAS_EXPIRE_ONLY (1ULL << 35) /** * CUSE INIT request/reply flags -- cgit v1.2.3 From 1c3610d30e5c15f4db7eb8465e311b582aa50ebe Mon Sep 17 00:00:00 2001 From: zyfjeff Date: Tue, 13 Dec 2022 19:51:47 +0800 Subject: fuse: remove duplicate check for nodeid before this check, the nodeid has already been checked once, so the check here doesn't make an sense, so remove the check for nodeid here. if (err || !outarg->nodeid) goto out_put_forget; err = -EIO; >>> if (!outarg->nodeid) goto out_put_forget; Signed-off-by: zyfjeff Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 35bc174f9ba2..5a4a7155cf1c 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -395,8 +395,6 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name goto out_put_forget; err = -EIO; - if (!outarg->nodeid) - goto out_put_forget; if (fuse_invalid_attr(&outarg->attr)) goto out_put_forget; -- cgit v1.2.3 From 3066ff93476c35679cb07a97cce37d9bb07632ff Mon Sep 17 00:00:00 2001 From: Bernd Schubert Date: Fri, 15 Apr 2022 13:53:56 +0200 Subject: fuse: Apply flags2 only when userspace set the FUSE_INIT_EXT This is just a safety precaution to avoid checking flags on memory that was initialized on the user space side. libfuse zeroes struct fuse_init_out outarg, but this is not guranteed to be done in all implementations. Better is to act on flags and to only apply flags2 when FUSE_INIT_EXT is set. There is a risk with this change, though - it might break existing user space libraries, which are already using flags2 without setting FUSE_INIT_EXT. The corresponding libfuse patch is here https://github.com/libfuse/libfuse/pull/662 Signed-off-by: Bernd Schubert Fixes: 53db28933e95 ("fuse: extend init flags") Cc: # v5.17 Signed-off-by: Miklos Szeredi --- fs/fuse/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 660be31aaabc..f19d748890f0 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1134,7 +1134,10 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, process_init_limits(fc, arg); if (arg->minor >= 6) { - u64 flags = arg->flags | (u64) arg->flags2 << 32; + u64 flags = arg->flags; + + if (flags & FUSE_INIT_EXT) + flags |= (u64) arg->flags2 << 32; ra_pages = arg->max_readahead / PAGE_SIZE; if (flags & FUSE_ASYNC_READ) -- cgit v1.2.3 From a9d1c4c6df0e568207907c04aed9e7beb1294c42 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 7 Jun 2023 17:49:20 +0200 Subject: fuse: revalidate: don't invalidate if interrupted If the LOOKUP request triggered from fuse_dentry_revalidate() is interrupted, then the dentry will be invalidated, possibly resulting in submounts being unmounted. Reported-by: Xu Rongbo Closes: https://lore.kernel.org/all/CAJfpegswN_CJJ6C3RZiaK6rpFmNyWmXfaEpnQUJ42KCwNF5tWw@mail.gmail.com/ Fixes: 9e6268db496a ("[PATCH] FUSE - read-write operations") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5a4a7155cf1c..f67bef9d83c4 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -258,7 +258,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags) spin_unlock(&fi->lock); } kfree(forget); - if (ret == -ENOMEM) + if (ret == -ENOMEM || ret == -EINTR) goto out; if (ret || fuse_invalid_attr(&outarg.attr) || fuse_stale_inode(inode, outarg.generation, &outarg.attr)) -- cgit v1.2.3 From 6a567e920fd0451bf29abc418df96c3365925770 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Wed, 7 Jun 2023 17:49:21 +0200 Subject: fuse: ioctl: translate ENOSYS in outarg Fuse shouldn't return ENOSYS from its ioctl implementation. If userspace responds with ENOSYS it should be translated to ENOTTY. There are two ways to return an error from the IOCTL request: - fuse_out_header.error - fuse_ioctl_out.result Commit 02c0cab8e734 ("fuse: ioctl: translate ENOSYS") already fixed this issue for the first case, but missed the second case. This patch fixes the second case. Reported-by: Jonathan Katz Closes: https://lore.kernel.org/all/CALKgVmcC1VUV_gJVq70n--omMJZUb4HSh_FqvLTHgNBc+HCLFQ@mail.gmail.com/ Fixes: 02c0cab8e734 ("fuse: ioctl: translate ENOSYS") Cc: Signed-off-by: Miklos Szeredi --- fs/fuse/ioctl.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 8e01bfdfc430..726640fa439e 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -9,14 +9,23 @@ #include #include -static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args) +static ssize_t fuse_send_ioctl(struct fuse_mount *fm, struct fuse_args *args, + struct fuse_ioctl_out *outarg) { - ssize_t ret = fuse_simple_request(fm, args); + ssize_t ret; + + args->out_args[0].size = sizeof(*outarg); + args->out_args[0].value = outarg; + + ret = fuse_simple_request(fm, args); /* Translate ENOSYS, which shouldn't be returned from fs */ if (ret == -ENOSYS) ret = -ENOTTY; + if (ret >= 0 && outarg->result == -ENOSYS) + outarg->result = -ENOTTY; + return ret; } @@ -264,13 +273,11 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, } ap.args.out_numargs = 2; - ap.args.out_args[0].size = sizeof(outarg); - ap.args.out_args[0].value = &outarg; ap.args.out_args[1].size = out_size; ap.args.out_pages = true; ap.args.out_argvar = true; - transferred = fuse_send_ioctl(fm, &ap.args); + transferred = fuse_send_ioctl(fm, &ap.args, &outarg); err = transferred; if (transferred < 0) goto out; @@ -399,12 +406,10 @@ static int fuse_priv_ioctl(struct inode *inode, struct fuse_file *ff, args.in_args[1].size = inarg.in_size; args.in_args[1].value = ptr; args.out_numargs = 2; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; args.out_args[1].size = inarg.out_size; args.out_args[1].value = ptr; - err = fuse_send_ioctl(fm, &args); + err = fuse_send_ioctl(fm, &args, &outarg); if (!err) { if (outarg.result < 0) err = outarg.result; -- cgit v1.2.3 From 2c56a751845ddfd3078ebe79981aaaa182629163 Mon Sep 17 00:00:00 2001 From: Fabio Estevam Date: Tue, 20 Jun 2023 08:22:02 -0300 Subject: drm/panel: simple: Add connector_type for innolux_at043tn24 The innolux at043tn24 display is a parallel LCD. Pass the 'connector_type' information to avoid the following warning: panel-simple panel: Specify missing connector_type Signed-off-by: Fabio Estevam Fixes: 41bcceb4de9c ("drm/panel: simple: Add support for Innolux AT043TN24") Reviewed-by: Sam Ravnborg Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20230620112202.654981-1-festevam@gmail.com --- drivers/gpu/drm/panel/panel-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 065f378bba9d..454a13cca43a 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -2117,6 +2117,7 @@ static const struct panel_desc innolux_at043tn24 = { .height = 54, }, .bus_format = MEDIA_BUS_FMT_RGB888_1X24, + .connector_type = DRM_MODE_CONNECTOR_DPI, .bus_flags = DRM_BUS_FLAG_DE_HIGH | DRM_BUS_FLAG_PIXDATA_DRIVE_POSEDGE, }; -- cgit v1.2.3 From e30cb0599799aac099209e3b045379613c80730e Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Mon, 19 Jun 2023 09:19:21 +0200 Subject: drm/sched: Make sure we wait for all dependencies in kill_jobs_cb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drm_sched_entity_kill_jobs_cb() logic is omitting the last fence popped from the dependency array that was waited upon before drm_sched_entity_kill() was called (drm_sched_entity::dependency field), so we're basically waiting for all dependencies except one. In theory, this wait shouldn't be needed because resources should have their users registered to the dma_resv object, thus guaranteeing that future jobs wanting to access these resources wait on all the previous users (depending on the access type, of course). But we want to keep these explicit waits in the kill entity path just in case. Let's make sure we keep all dependencies in the array in drm_sched_job_dependency(), so we can iterate over the array and wait in drm_sched_entity_kill_jobs_cb(). We also make sure we wait on drm_sched_fence::finished if we were originally asked to wait on drm_sched_fence::scheduled. In that case, we assume the intent was to delegate the wait to the firmware/GPU or rely on the pipelining done at the entity/scheduler level, but when killing jobs, we really want to wait for completion not just scheduling. v2: - Don't evict deps in drm_sched_job_dependency() v3: - Always wait for drm_sched_fence::finished fences in drm_sched_entity_kill_jobs_cb() when we see a sched_fence v4: - Fix commit message - Fix a use-after-free bug v5: - Flag deps on which we should only wait for the scheduled event at insertion time v6: - Back to v4 implementation - Add Christian's R-b Cc: Frank Binns Cc: Sarah Walker Cc: Donald Robson Cc: Luben Tuikov Cc: David Airlie Cc: Daniel Vetter Cc: Sumit Semwal Cc: "Christian König" Signed-off-by: Boris Brezillon Suggested-by: "Christian König" Reviewed-by: "Christian König" Acked-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230619071921.3465992-1-boris.brezillon@collabora.com --- drivers/gpu/drm/scheduler/sched_entity.c | 41 +++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_entity.c b/drivers/gpu/drm/scheduler/sched_entity.c index e0a8890a62e2..42021d1f7e01 100644 --- a/drivers/gpu/drm/scheduler/sched_entity.c +++ b/drivers/gpu/drm/scheduler/sched_entity.c @@ -155,16 +155,32 @@ static void drm_sched_entity_kill_jobs_cb(struct dma_fence *f, { struct drm_sched_job *job = container_of(cb, struct drm_sched_job, finish_cb); - int r; + unsigned long index; dma_fence_put(f); /* Wait for all dependencies to avoid data corruptions */ - while (!xa_empty(&job->dependencies)) { - f = xa_erase(&job->dependencies, job->last_dependency++); - r = dma_fence_add_callback(f, &job->finish_cb, - drm_sched_entity_kill_jobs_cb); - if (!r) + xa_for_each(&job->dependencies, index, f) { + struct drm_sched_fence *s_fence = to_drm_sched_fence(f); + + if (s_fence && f == &s_fence->scheduled) { + /* The dependencies array had a reference on the scheduled + * fence, and the finished fence refcount might have + * dropped to zero. Use dma_fence_get_rcu() so we get + * a NULL fence in that case. + */ + f = dma_fence_get_rcu(&s_fence->finished); + + /* Now that we have a reference on the finished fence, + * we can release the reference the dependencies array + * had on the scheduled fence. + */ + dma_fence_put(&s_fence->scheduled); + } + + xa_erase(&job->dependencies, index); + if (f && !dma_fence_add_callback(f, &job->finish_cb, + drm_sched_entity_kill_jobs_cb)) return; dma_fence_put(f); @@ -394,8 +410,17 @@ static struct dma_fence * drm_sched_job_dependency(struct drm_sched_job *job, struct drm_sched_entity *entity) { - if (!xa_empty(&job->dependencies)) - return xa_erase(&job->dependencies, job->last_dependency++); + struct dma_fence *f; + + /* We keep the fence around, so we can iterate over all dependencies + * in drm_sched_entity_kill_jobs_cb() to ensure all deps are signaled + * before killing the job. + */ + f = xa_load(&job->dependencies, job->last_dependency); + if (f) { + job->last_dependency++; + return dma_fence_get(f); + } if (job->sched->ops->prepare_job) return job->sched->ops->prepare_job(job, entity); -- cgit v1.2.3 From 98703e4e061fb8715c7613cd227e32cdfd136b23 Mon Sep 17 00:00:00 2001 From: Adrián Larumbe Date: Thu, 1 Jun 2023 13:31:53 +0100 Subject: drm: bridge: dw_hdmi: fix connector access for scdc MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 5d844091f237 ("drm/scdc-helper: Pimp SCDC debugs") changed the scdc interface to pick up an i2c adapter from a connector instead. However, in the case of dw-hdmi, the wrong connector was being used to pass i2c adapter information, since dw-hdmi's embedded connector structure is only populated when the bridge attachment callback explicitly asks for it. drm-meson is handling connector creation, so this won't happen, leading to a NULL pointer dereference. Fix it by having scdc functions access dw-hdmi's current connector pointer instead, which is assigned during the bridge enablement stage. Fixes: 5d844091f237 ("drm/scdc-helper: Pimp SCDC debugs") Signed-off-by: Adrián Larumbe Reported-by: Lukas F. Hartmann Acked-by: Neil Armstrong [narmstrong: moved Fixes tag before first S-o-b and added Reported-by tag] Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20230601123153.196867-1-adrian.larumbe@collabora.com --- drivers/gpu/drm/bridge/synopsys/dw-hdmi.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c index 603bb3c51027..3b40e0fdca5c 100644 --- a/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c +++ b/drivers/gpu/drm/bridge/synopsys/dw-hdmi.c @@ -1426,9 +1426,9 @@ void dw_hdmi_set_high_tmds_clock_ratio(struct dw_hdmi *hdmi, /* Control for TMDS Bit Period/TMDS Clock-Period Ratio */ if (dw_hdmi_support_scdc(hdmi, display)) { if (mtmdsclock > HDMI14_MAX_TMDSCLK) - drm_scdc_set_high_tmds_clock_ratio(&hdmi->connector, 1); + drm_scdc_set_high_tmds_clock_ratio(hdmi->curr_conn, 1); else - drm_scdc_set_high_tmds_clock_ratio(&hdmi->connector, 0); + drm_scdc_set_high_tmds_clock_ratio(hdmi->curr_conn, 0); } } EXPORT_SYMBOL_GPL(dw_hdmi_set_high_tmds_clock_ratio); @@ -2116,7 +2116,7 @@ static void hdmi_av_composer(struct dw_hdmi *hdmi, min_t(u8, bytes, SCDC_MIN_SOURCE_VERSION)); /* Enabled Scrambling in the Sink */ - drm_scdc_set_scrambling(&hdmi->connector, 1); + drm_scdc_set_scrambling(hdmi->curr_conn, 1); /* * To activate the scrambler feature, you must ensure @@ -2132,7 +2132,7 @@ static void hdmi_av_composer(struct dw_hdmi *hdmi, hdmi_writeb(hdmi, 0, HDMI_FC_SCRAMBLER_CTRL); hdmi_writeb(hdmi, (u8)~HDMI_MC_SWRSTZ_TMDSSWRST_REQ, HDMI_MC_SWRSTZ); - drm_scdc_set_scrambling(&hdmi->connector, 0); + drm_scdc_set_scrambling(hdmi->curr_conn, 0); } } @@ -3553,6 +3553,7 @@ struct dw_hdmi *dw_hdmi_probe(struct platform_device *pdev, hdmi->bridge.ops = DRM_BRIDGE_OP_DETECT | DRM_BRIDGE_OP_EDID | DRM_BRIDGE_OP_HPD; hdmi->bridge.interlace_allowed = true; + hdmi->bridge.ddc = hdmi->ddc; #ifdef CONFIG_OF hdmi->bridge.of_node = pdev->dev.of_node; #endif -- cgit v1.2.3 From 4481913607e58196c48a4fef5e6f45350684ec3c Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Thu, 22 Jun 2023 10:18:03 -0400 Subject: drm/ttm: fix bulk_move corruption when adding a entry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the resource is the first in the bulk_move range, adding it again (thus moving it to the tail) will corrupt the list since the first pointer is not moved. This eventually lead to null pointer deref in ttm_lru_bulk_move_del() Fixes: fee2ede15542 ("drm/ttm: rework bulk move handling v5") Signed-off-by: Yunxiang Li Reviewed-by: Christian König CC: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20230622141902.28718-3-Yunxiang.Li@amd.com Signed-off-by: Christian König --- drivers/gpu/drm/ttm/ttm_resource.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index 7333f7a87a2f..e51dbc7a2d53 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -86,6 +86,8 @@ static void ttm_lru_bulk_move_pos_tail(struct ttm_lru_bulk_move_pos *pos, struct ttm_resource *res) { if (pos->last != res) { + if (pos->first == res) + pos->first = list_next_entry(res, lru); list_move(&res->lru, &pos->last->lru); pos->last = res; } @@ -111,7 +113,8 @@ static void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, { struct ttm_lru_bulk_move_pos *pos = ttm_lru_bulk_move_pos(bulk, res); - if (unlikely(pos->first == res && pos->last == res)) { + if (unlikely(WARN_ON(!pos->first || !pos->last) || + pos->first == res && pos->last == res)) { pos->first = NULL; pos->last = NULL; } else if (pos->first == res) { -- cgit v1.2.3 From 0c3855ba8dad41c4113e73f77eb926e44577e4af Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 23 Jun 2023 09:08:00 +0200 Subject: drm/ttm: fix warning that we shouldn't mix && and || MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trivial warning fix. Signed-off-by: Christian König Fixes: 4481913607e5 ("drm/ttm: fix bulk_move corruption when adding a entry") Reviewed-by: Alex Deucher Link: https://patchwork.freedesktop.org/patch/msgid/20230623070935.65102-1-christian.koenig@amd.com --- drivers/gpu/drm/ttm/ttm_resource.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_resource.c b/drivers/gpu/drm/ttm/ttm_resource.c index e51dbc7a2d53..46ff9c75bb12 100644 --- a/drivers/gpu/drm/ttm/ttm_resource.c +++ b/drivers/gpu/drm/ttm/ttm_resource.c @@ -114,7 +114,7 @@ static void ttm_lru_bulk_move_del(struct ttm_lru_bulk_move *bulk, struct ttm_lru_bulk_move_pos *pos = ttm_lru_bulk_move_pos(bulk, res); if (unlikely(WARN_ON(!pos->first || !pos->last) || - pos->first == res && pos->last == res)) { + (pos->first == res && pos->last == res))) { pos->first = NULL; pos->last = NULL; } else if (pos->first == res) { -- cgit v1.2.3 From 8fb3e25c3dd1a2755c848ce7488c2f06a9fb9f97 Mon Sep 17 00:00:00 2001 From: Lyude Paul Date: Tue, 13 Jun 2023 17:05:28 -0400 Subject: drm/nouveau/kms/nv50-: Fix drm_dp_remove_payload() invocation We changed the semantics for this in: commit e761cc20946a ("drm/display/dp_mst: Handle old/new payload states in drm_dp_remove_payload()") But I totally forgot to update this properly in nouveau. So, let's do that. Signed-off-by: Lyude Paul Reviewed-by: Karol Herbst Link: https://patchwork.freedesktop.org/patch/msgid/20230613210529.552098-1-lyude@redhat.com --- drivers/gpu/drm/nouveau/dispnv50/disp.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/nouveau/dispnv50/disp.c b/drivers/gpu/drm/nouveau/dispnv50/disp.c index 5bb777ff1313..1637e08b548c 100644 --- a/drivers/gpu/drm/nouveau/dispnv50/disp.c +++ b/drivers/gpu/drm/nouveau/dispnv50/disp.c @@ -909,15 +909,19 @@ nv50_msto_prepare(struct drm_atomic_state *state, struct nouveau_drm *drm = nouveau_drm(msto->encoder.dev); struct nv50_mstc *mstc = msto->mstc; struct nv50_mstm *mstm = mstc->mstm; - struct drm_dp_mst_atomic_payload *payload; + struct drm_dp_mst_topology_state *old_mst_state; + struct drm_dp_mst_atomic_payload *payload, *old_payload; NV_ATOMIC(drm, "%s: msto prepare\n", msto->encoder.name); + old_mst_state = drm_atomic_get_old_mst_topology_state(state, mgr); + payload = drm_atomic_get_mst_payload_state(mst_state, mstc->port); + old_payload = drm_atomic_get_mst_payload_state(old_mst_state, mstc->port); // TODO: Figure out if we want to do a better job of handling VCPI allocation failures here? if (msto->disabled) { - drm_dp_remove_payload(mgr, mst_state, payload, payload); + drm_dp_remove_payload(mgr, mst_state, old_payload, payload); nvif_outp_dp_mst_vcpi(&mstm->outp->outp, msto->head->base.index, 0, 0, 0, 0); } else { -- cgit v1.2.3 From db8b4968a8d0e86c0f8bd7541359a4111a5b39ad Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Fri, 23 Jun 2023 09:52:04 +0200 Subject: drm/sched: Call drm_sched_fence_set_parent() from drm_sched_fence_scheduled() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Drivers that can delegate waits to the firmware/GPU pass the scheduled fence to drm_sched_job_add_dependency(), and issue wait commands to the firmware/GPU at job submission time. For this to be possible, they need all their 'native' dependencies to have a valid parent since this is where the actual HW fence information are encoded. In drm_sched_main(), we currently call drm_sched_fence_set_parent() after drm_sched_fence_scheduled(), leaving a short period of time during which the job depending on this fence can be submitted. Since setting parent and signaling the fence are two things that are kinda related (you can't have a parent if the job hasn't been scheduled), it probably makes sense to pass the parent fence to drm_sched_fence_scheduled() and let it call drm_sched_fence_set_parent() before it signals the scheduled fence. Here is a detailed description of the race we are fixing here: Thread A Thread B - calls drm_sched_fence_scheduled() - signals s_fence->scheduled which wakes up thread B - entity dep signaled, checking the next dep - no more deps waiting - entity is picked for job submission by drm_gpu_scheduler - run_job() is called - run_job() tries to collect native fence info from s_fence->parent, but it's NULL => BOOM, we can't do our native wait - calls drm_sched_fence_set_parent() v2: * Fix commit message v3: * Add a detailed description of the race to the commit message * Add Luben's R-b Signed-off-by: Boris Brezillon Cc: Frank Binns Cc: Sarah Walker Cc: Donald Robson Cc: Luben Tuikov Cc: David Airlie Cc: Daniel Vetter Cc: Sumit Semwal Cc: "Christian König" Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230623075204.382350-1-boris.brezillon@collabora.com --- drivers/gpu/drm/scheduler/sched_fence.c | 40 ++++++++++++++++++++------------- drivers/gpu/drm/scheduler/sched_main.c | 3 +-- include/drm/gpu_scheduler.h | 5 ++--- 3 files changed, 28 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/scheduler/sched_fence.c b/drivers/gpu/drm/scheduler/sched_fence.c index fe9c6468e440..b6e70ddb4ee5 100644 --- a/drivers/gpu/drm/scheduler/sched_fence.c +++ b/drivers/gpu/drm/scheduler/sched_fence.c @@ -48,8 +48,32 @@ static void __exit drm_sched_fence_slab_fini(void) kmem_cache_destroy(sched_fence_slab); } -void drm_sched_fence_scheduled(struct drm_sched_fence *fence) +static void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, + struct dma_fence *fence) { + /* + * smp_store_release() to ensure another thread racing us + * in drm_sched_fence_set_deadline_finished() sees the + * fence's parent set before test_bit() + */ + smp_store_release(&s_fence->parent, dma_fence_get(fence)); + if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, + &s_fence->finished.flags)) + dma_fence_set_deadline(fence, s_fence->deadline); +} + +void drm_sched_fence_scheduled(struct drm_sched_fence *fence, + struct dma_fence *parent) +{ + /* Set the parent before signaling the scheduled fence, such that, + * any waiter expecting the parent to be filled after the job has + * been scheduled (which is the case for drivers delegating waits + * to some firmware) doesn't have to busy wait for parent to show + * up. + */ + if (!IS_ERR_OR_NULL(parent)) + drm_sched_fence_set_parent(fence, parent); + dma_fence_signal(&fence->scheduled); } @@ -179,20 +203,6 @@ struct drm_sched_fence *to_drm_sched_fence(struct dma_fence *f) } EXPORT_SYMBOL(to_drm_sched_fence); -void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, - struct dma_fence *fence) -{ - /* - * smp_store_release() to ensure another thread racing us - * in drm_sched_fence_set_deadline_finished() sees the - * fence's parent set before test_bit() - */ - smp_store_release(&s_fence->parent, dma_fence_get(fence)); - if (test_bit(DRM_SCHED_FENCE_FLAG_HAS_DEADLINE_BIT, - &s_fence->finished.flags)) - dma_fence_set_deadline(fence, s_fence->deadline); -} - struct drm_sched_fence *drm_sched_fence_alloc(struct drm_sched_entity *entity, void *owner) { diff --git a/drivers/gpu/drm/scheduler/sched_main.c b/drivers/gpu/drm/scheduler/sched_main.c index aea5a90ff98b..eff0a7f42f69 100644 --- a/drivers/gpu/drm/scheduler/sched_main.c +++ b/drivers/gpu/drm/scheduler/sched_main.c @@ -1040,10 +1040,9 @@ static int drm_sched_main(void *param) trace_drm_run_job(sched_job, entity); fence = sched->ops->run_job(sched_job); complete_all(&entity->entity_idle); - drm_sched_fence_scheduled(s_fence); + drm_sched_fence_scheduled(s_fence, fence); if (!IS_ERR_OR_NULL(fence)) { - drm_sched_fence_set_parent(s_fence, fence); /* Drop for original kref_init of the fence */ dma_fence_put(fence); diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h index c0586d832260..b29e347b10a9 100644 --- a/include/drm/gpu_scheduler.h +++ b/include/drm/gpu_scheduler.h @@ -582,15 +582,14 @@ void drm_sched_entity_set_priority(struct drm_sched_entity *entity, enum drm_sched_priority priority); bool drm_sched_entity_is_ready(struct drm_sched_entity *entity); -void drm_sched_fence_set_parent(struct drm_sched_fence *s_fence, - struct dma_fence *fence); struct drm_sched_fence *drm_sched_fence_alloc( struct drm_sched_entity *s_entity, void *owner); void drm_sched_fence_init(struct drm_sched_fence *fence, struct drm_sched_entity *entity); void drm_sched_fence_free(struct drm_sched_fence *fence); -void drm_sched_fence_scheduled(struct drm_sched_fence *fence); +void drm_sched_fence_scheduled(struct drm_sched_fence *fence, + struct dma_fence *parent); void drm_sched_fence_finished(struct drm_sched_fence *fence); unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched); -- cgit v1.2.3 From 7aa83fbd712a6f08ffa67890061f26d140c2a84f Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Tue, 13 Jun 2023 06:58:13 -0700 Subject: drm/bridge: ti-sn65dsi86: Fix auxiliary bus lifetime Memory for the "struct device" for any given device isn't supposed to be released until the device's release() is called. This is important because someone might be holding a kobject reference to the "struct device" and might try to access one of its members even after any other cleanup/uninitialization has happened. Code analysis of ti-sn65dsi86 shows that this isn't quite right. When the code was written, it was believed that we could rely on the fact that the child devices would all be freed before the parent devices and thus we didn't need to worry about a release() function. While I still believe that the parent's "struct device" is guaranteed to outlive the child's "struct device" (because the child holds a kobject reference to the parent), the parent's "devm" allocated memory is a different story. That appears to be freed much earlier. Let's make this better for ti-sn65dsi86 by allocating each auxiliary with kzalloc and then free that memory in the release(). Fixes: bf73537f411b ("drm/bridge: ti-sn65dsi86: Break GPIO and MIPI-to-eDP bridge into sub-drivers") Suggested-by: Stephen Boyd Reviewed-by: Stephen Boyd Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20230613065812.v2.1.I24b838a5b4151fb32bccd6f36397998ea2df9fbb@changeid --- drivers/gpu/drm/bridge/ti-sn65dsi86.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/drivers/gpu/drm/bridge/ti-sn65dsi86.c b/drivers/gpu/drm/bridge/ti-sn65dsi86.c index 4676cf2900df..3c8fd6ea6d6a 100644 --- a/drivers/gpu/drm/bridge/ti-sn65dsi86.c +++ b/drivers/gpu/drm/bridge/ti-sn65dsi86.c @@ -170,10 +170,10 @@ * @pwm_refclk_freq: Cache for the reference clock input to the PWM. */ struct ti_sn65dsi86 { - struct auxiliary_device bridge_aux; - struct auxiliary_device gpio_aux; - struct auxiliary_device aux_aux; - struct auxiliary_device pwm_aux; + struct auxiliary_device *bridge_aux; + struct auxiliary_device *gpio_aux; + struct auxiliary_device *aux_aux; + struct auxiliary_device *pwm_aux; struct device *dev; struct regmap *regmap; @@ -468,27 +468,34 @@ static void ti_sn65dsi86_delete_aux(void *data) auxiliary_device_delete(data); } -/* - * AUX bus docs say that a non-NULL release is mandatory, but it makes no - * sense for the model used here where all of the aux devices are allocated - * in the single shared structure. We'll use this noop as a workaround. - */ -static void ti_sn65dsi86_noop(struct device *dev) {} +static void ti_sn65dsi86_aux_device_release(struct device *dev) +{ + struct auxiliary_device *aux = container_of(dev, struct auxiliary_device, dev); + + kfree(aux); +} static int ti_sn65dsi86_add_aux_device(struct ti_sn65dsi86 *pdata, - struct auxiliary_device *aux, + struct auxiliary_device **aux_out, const char *name) { struct device *dev = pdata->dev; + struct auxiliary_device *aux; int ret; + aux = kzalloc(sizeof(*aux), GFP_KERNEL); + if (!aux) + return -ENOMEM; + aux->name = name; aux->dev.parent = dev; - aux->dev.release = ti_sn65dsi86_noop; + aux->dev.release = ti_sn65dsi86_aux_device_release; device_set_of_node_from_dev(&aux->dev, dev); ret = auxiliary_device_init(aux); - if (ret) + if (ret) { + kfree(aux); return ret; + } ret = devm_add_action_or_reset(dev, ti_sn65dsi86_uninit_aux, aux); if (ret) return ret; @@ -497,6 +504,8 @@ static int ti_sn65dsi86_add_aux_device(struct ti_sn65dsi86 *pdata, if (ret) return ret; ret = devm_add_action_or_reset(dev, ti_sn65dsi86_delete_aux, aux); + if (!ret) + *aux_out = aux; return ret; } -- cgit v1.2.3 From e8188c461ee015ba0b9ab2fc82dbd5ebca5a5532 Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Mon, 26 Jun 2023 11:14:49 +0200 Subject: drm/ttm: Don't leak a resource on eviction error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On eviction errors other than -EMULTIHOP we were leaking a resource. Fix. v2: - Avoid yet another goto (Andi Shyti) Fixes: 403797925768 ("drm/ttm: Fix multihop assert on eviction.") Cc: Andrey Grodzovsky Cc: Christian König Cc: Christian Koenig Cc: Huang Rui Cc: dri-devel@lists.freedesktop.org Cc: # v5.15+ Signed-off-by: Thomas Hellström Reviewed-by: Nirmoy Das #v1 Reviewed-by: Andi Shyti Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20230626091450.14757-4-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index bd5dae4d1624..d364639a2752 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -458,18 +458,18 @@ static int ttm_bo_evict(struct ttm_buffer_object *bo, goto out; } -bounce: - ret = ttm_bo_handle_move_mem(bo, evict_mem, true, ctx, &hop); - if (ret == -EMULTIHOP) { + do { + ret = ttm_bo_handle_move_mem(bo, evict_mem, true, ctx, &hop); + if (ret != -EMULTIHOP) + break; + ret = ttm_bo_bounce_temp_buffer(bo, &evict_mem, ctx, &hop); - if (ret) { - if (ret != -ERESTARTSYS && ret != -EINTR) - pr_err("Buffer eviction failed\n"); - ttm_resource_free(bo, &evict_mem); - goto out; - } - /* try and move to final place now. */ - goto bounce; + } while (!ret); + + if (ret) { + ttm_resource_free(bo, &evict_mem); + if (ret != -ERESTARTSYS && ret != -EINTR) + pr_err("Buffer eviction failed\n"); } out: return ret; -- cgit v1.2.3 From a590f03d8de7c4cb7ce4916dc7f2fd10711faabe Mon Sep 17 00:00:00 2001 From: Thomas Hellström Date: Mon, 26 Jun 2023 11:14:50 +0200 Subject: drm/ttm: Don't leak a resource on swapout move error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If moving the bo to system for swapout failed, we were leaking a resource. Fix. Fixes: bfa3357ef9ab ("drm/ttm: allocate resource object instead of embedding it v2") Cc: Christian König Cc: "Christian König" Cc: dri-devel@lists.freedesktop.org Cc: # v5.14+ Signed-off-by: Thomas Hellström Reviewed-by: Nirmoy Das Reviewed-by: Andi Shyti Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20230626091450.14757-5-thomas.hellstrom@linux.intel.com --- drivers/gpu/drm/ttm/ttm_bo.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index d364639a2752..1a1cfd675cc4 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1167,6 +1167,7 @@ int ttm_bo_swapout(struct ttm_buffer_object *bo, struct ttm_operation_ctx *ctx, ret = ttm_bo_handle_move_mem(bo, evict_mem, true, &ctx, &hop); if (unlikely(ret != 0)) { WARN(ret == -EMULTIHOP, "Unexpected multihop in swaput - likely driver bug.\n"); + ttm_resource_free(bo, &evict_mem); goto out; } } -- cgit v1.2.3 From 1c519980aced3da1fae37c1339cf43b24eccdee7 Mon Sep 17 00:00:00 2001 From: Marek Vasut Date: Thu, 15 Jun 2023 22:16:02 +0200 Subject: drm/panel: simple: Add Powertip PH800480T013 drm_display_mode flags Add missing drm_display_mode DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_NHSYNC flags. Those are used by various bridges in the pipeline to correctly configure its sync signals polarity. Fixes: d69de69f2be1 ("drm/panel: simple: Add Powertip PH800480T013 panel") Signed-off-by: Marek Vasut Reviewed-by: Sam Ravnborg Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20230615201602.565948-1-marex@denx.de --- drivers/gpu/drm/panel/panel-simple.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index 454a13cca43a..7e10ae9f8d5d 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -3110,6 +3110,7 @@ static const struct drm_display_mode powertip_ph800480t013_idf02_mode = { .vsync_start = 480 + 49, .vsync_end = 480 + 49 + 2, .vtotal = 480 + 49 + 2 + 22, + .flags = DRM_MODE_FLAG_NVSYNC | DRM_MODE_FLAG_NHSYNC, }; static const struct panel_desc powertip_ph800480t013_idf02 = { -- cgit v1.2.3 From f781f661e8c99b0cb34129f2e374234d61864e77 Mon Sep 17 00:00:00 2001 From: Christian König Date: Tue, 13 Jun 2023 10:09:20 +0200 Subject: dma-buf: keep the signaling time of merged fences v3 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Some Android CTS is testing if the signaling time keeps consistent during merges. v2: use the current time if the fence is still in the signaling path and the timestamp not yet available. v3: improve comment, fix one more case to use the correct timestamp Signed-off-by: Christian König Reviewed-by: Luben Tuikov Link: https://patchwork.freedesktop.org/patch/msgid/20230630120041.109216-1-christian.koenig@amd.com --- drivers/dma-buf/dma-fence-unwrap.c | 26 ++++++++++++++++++++++---- drivers/dma-buf/dma-fence.c | 5 +++-- drivers/gpu/drm/drm_syncobj.c | 2 +- include/linux/dma-fence.h | 2 +- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/drivers/dma-buf/dma-fence-unwrap.c b/drivers/dma-buf/dma-fence-unwrap.c index 7002bca792ff..c625bb2b5d56 100644 --- a/drivers/dma-buf/dma-fence-unwrap.c +++ b/drivers/dma-buf/dma-fence-unwrap.c @@ -66,18 +66,36 @@ struct dma_fence *__dma_fence_unwrap_merge(unsigned int num_fences, { struct dma_fence_array *result; struct dma_fence *tmp, **array; + ktime_t timestamp; unsigned int i; size_t count; count = 0; + timestamp = ns_to_ktime(0); for (i = 0; i < num_fences; ++i) { - dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) - if (!dma_fence_is_signaled(tmp)) + dma_fence_unwrap_for_each(tmp, &iter[i], fences[i]) { + if (!dma_fence_is_signaled(tmp)) { ++count; + } else if (test_bit(DMA_FENCE_FLAG_TIMESTAMP_BIT, + &tmp->flags)) { + if (ktime_after(tmp->timestamp, timestamp)) + timestamp = tmp->timestamp; + } else { + /* + * Use the current time if the fence is + * currently signaling. + */ + timestamp = ktime_get(); + } + } } + /* + * If we couldn't find a pending fence just return a private signaled + * fence with the timestamp of the last signaled one. + */ if (count == 0) - return dma_fence_get_stub(); + return dma_fence_allocate_private_stub(timestamp); array = kmalloc_array(count, sizeof(*array), GFP_KERNEL); if (!array) @@ -138,7 +156,7 @@ restart: } while (tmp); if (count == 0) { - tmp = dma_fence_get_stub(); + tmp = dma_fence_allocate_private_stub(ktime_get()); goto return_tmp; } diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index f177c56269bb..ad076f208760 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -150,10 +150,11 @@ EXPORT_SYMBOL(dma_fence_get_stub); /** * dma_fence_allocate_private_stub - return a private, signaled fence + * @timestamp: timestamp when the fence was signaled * * Return a newly allocated and signaled stub fence. */ -struct dma_fence *dma_fence_allocate_private_stub(void) +struct dma_fence *dma_fence_allocate_private_stub(ktime_t timestamp) { struct dma_fence *fence; @@ -169,7 +170,7 @@ struct dma_fence *dma_fence_allocate_private_stub(void) set_bit(DMA_FENCE_FLAG_ENABLE_SIGNAL_BIT, &fence->flags); - dma_fence_signal(fence); + dma_fence_signal_timestamp(fence, timestamp); return fence; } diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 0c2be8360525..04589a35eb09 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -353,7 +353,7 @@ EXPORT_SYMBOL(drm_syncobj_replace_fence); */ static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj) { - struct dma_fence *fence = dma_fence_allocate_private_stub(); + struct dma_fence *fence = dma_fence_allocate_private_stub(ktime_get()); if (IS_ERR(fence)) return PTR_ERR(fence); diff --git a/include/linux/dma-fence.h b/include/linux/dma-fence.h index d54b595a0fe0..0d678e9a7b24 100644 --- a/include/linux/dma-fence.h +++ b/include/linux/dma-fence.h @@ -606,7 +606,7 @@ static inline signed long dma_fence_wait(struct dma_fence *fence, bool intr) void dma_fence_set_deadline(struct dma_fence *fence, ktime_t deadline); struct dma_fence *dma_fence_get_stub(void); -struct dma_fence *dma_fence_allocate_private_stub(void); +struct dma_fence *dma_fence_allocate_private_stub(ktime_t timestamp); u64 dma_fence_context_alloc(unsigned num); extern const struct dma_fence_ops dma_fence_array_ops; -- cgit v1.2.3 From 020b527b556a35cf636015c1c3cbdfe7c7acd5f0 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Mon, 3 Jul 2023 10:07:24 +0200 Subject: accel/ivpu: Fix VPU register access in irq disable Incorrect REGB_WR32() macro was used to access VPUIP register. Use correct REGV_WR32(). Fixes: 35b137630f08 ("accel/ivpu: Introduce a new DRM driver for Intel VPU") Cc: stable@vger.kernel.org # 6.3.x Signed-off-by: Karol Wachowski Reviewed-by: Jacek Lawrynowicz Signed-off-by: Stanislaw Gruszka Link: https://patchwork.freedesktop.org/patch/msgid/20230703080725.2065635-1-stanislaw.gruszka@linux.intel.com --- drivers/accel/ivpu/ivpu_hw_mtl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_hw_mtl.c b/drivers/accel/ivpu/ivpu_hw_mtl.c index fef35422c6f0..3485be27138a 100644 --- a/drivers/accel/ivpu/ivpu_hw_mtl.c +++ b/drivers/accel/ivpu/ivpu_hw_mtl.c @@ -885,7 +885,7 @@ static void ivpu_hw_mtl_irq_disable(struct ivpu_device *vdev) REGB_WR32(MTL_BUTTRESS_GLOBAL_INT_MASK, 0x1); REGB_WR32(MTL_BUTTRESS_LOCAL_INT_MASK, BUTTRESS_IRQ_DISABLE_MASK); REGV_WR64(MTL_VPU_HOST_SS_ICB_ENABLE_0, 0x0ull); - REGB_WR32(MTL_VPU_HOST_SS_FW_SOC_IRQ_EN, 0x0); + REGV_WR32(MTL_VPU_HOST_SS_FW_SOC_IRQ_EN, 0x0); } static void ivpu_hw_mtl_irq_wdt_nce_handler(struct ivpu_device *vdev) -- cgit v1.2.3 From 7f34e01f77f811ecb2ef83e60301b38cf89af466 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Mon, 3 Jul 2023 10:07:25 +0200 Subject: accel/ivpu: Clear specific interrupt status bits on C0 MTL C0 stepping fixed issue related to butrress interrupt status clearing, to clear an interrupt status it is required to write 1 to specific status bit field. This allows to execute read, modify and write routine. Writing 0 will not clear the interrupt and will cause interrupt storm. Fixes: 35b137630f08 ("accel/ivpu: Introduce a new DRM driver for Intel VPU") Cc: stable@vger.kernel.org # 6.3.x Signed-off-by: Karol Wachowski Reviewed-by: Jacek Lawrynowicz Signed-off-by: Stanislaw Gruszka Link: https://patchwork.freedesktop.org/patch/msgid/20230703080725.2065635-2-stanislaw.gruszka@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.h | 1 + drivers/accel/ivpu/ivpu_hw_mtl.c | 18 ++++++++++++------ 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.h b/drivers/accel/ivpu/ivpu_drv.h index d3013fbd13b3..399dc5dcefd7 100644 --- a/drivers/accel/ivpu/ivpu_drv.h +++ b/drivers/accel/ivpu/ivpu_drv.h @@ -75,6 +75,7 @@ struct ivpu_wa_table { bool punit_disabled; bool clear_runtime_mem; bool d3hot_after_power_off; + bool interrupt_clear_with_0; }; struct ivpu_hw_info; diff --git a/drivers/accel/ivpu/ivpu_hw_mtl.c b/drivers/accel/ivpu/ivpu_hw_mtl.c index 3485be27138a..2a5dd3a5dc46 100644 --- a/drivers/accel/ivpu/ivpu_hw_mtl.c +++ b/drivers/accel/ivpu/ivpu_hw_mtl.c @@ -101,6 +101,9 @@ static void ivpu_hw_wa_init(struct ivpu_device *vdev) vdev->wa.punit_disabled = ivpu_is_fpga(vdev); vdev->wa.clear_runtime_mem = false; vdev->wa.d3hot_after_power_off = true; + + if (ivpu_device_id(vdev) == PCI_DEVICE_ID_MTL && ivpu_revision(vdev) < 4) + vdev->wa.interrupt_clear_with_0 = true; } static void ivpu_hw_timeouts_init(struct ivpu_device *vdev) @@ -973,12 +976,15 @@ static u32 ivpu_hw_mtl_irqb_handler(struct ivpu_device *vdev, int irq) schedule_recovery = true; } - /* - * Clear local interrupt status by writing 0 to all bits. - * This must be done after interrupts are cleared at the source. - * Writing 1 triggers an interrupt, so we can't perform read update write. - */ - REGB_WR32(MTL_BUTTRESS_INTERRUPT_STAT, 0x0); + /* This must be done after interrupts are cleared at the source. */ + if (IVPU_WA(interrupt_clear_with_0)) + /* + * Writing 1 triggers an interrupt, so we can't perform read update write. + * Clear local interrupt status by writing 0 to all bits. + */ + REGB_WR32(MTL_BUTTRESS_INTERRUPT_STAT, 0x0); + else + REGB_WR32(MTL_BUTTRESS_INTERRUPT_STAT, status); /* Re-enable global interrupt */ REGB_WR32(MTL_BUTTRESS_GLOBAL_INT_MASK, 0x0); -- cgit v1.2.3 From 7eb1e47696aa231b1a567846bbe3a1e1befe1854 Mon Sep 17 00:00:00 2001 From: Michael Schmitz Date: Wed, 5 Jul 2023 11:38:08 +1200 Subject: block/partition: fix signedness issue for Amiga partitions Making 'blk' sector_t (i.e. 64 bit if LBD support is active) fails the 'blk>0' test in the partition block loop if a value of (signed int) -1 is used to mark the end of the partition block list. Explicitly cast 'blk' to signed int to allow use of -1 to terminate the partition block linked list. Fixes: b6f3f28f604b ("block: add overflow checks for Amiga partition support") Reported-by: Christian Zigotzky Link: https://lore.kernel.org/r/024ce4fa-cc6d-50a2-9aae-3701d0ebf668@xenosoft.de Signed-off-by: Michael Schmitz Reviewed-by: Martin Steigerwald Tested-by: Christian Zigotzky Signed-off-by: Jens Axboe --- block/partitions/amiga.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index ed222b9c901b..506921095412 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -90,7 +90,7 @@ int amiga_partition(struct parsed_partitions *state) } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { + for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { /* Read in terms partition table understands */ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", -- cgit v1.2.3 From 2fb48d88e77f29bf9d278f25bcfe82cf59a0e09b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Fri, 9 Jun 2023 23:11:39 -0700 Subject: blk-crypto: use dynamic lock class for blk_crypto_profile::lock When a device-mapper device is passing through the inline encryption support of an underlying device, calls to blk_crypto_evict_key() take the blk_crypto_profile::lock of the device-mapper device, then take the blk_crypto_profile::lock of the underlying device (nested). This isn't a real deadlock, but it causes a lockdep report because there is only one lock class for all instances of this lock. Lockdep subclasses don't really work here because the hierarchy of block devices is dynamic and could have more than 2 levels. Instead, register a dynamic lock class for each blk_crypto_profile, and associate that with the lock. This avoids false-positive lockdep reports like the following: ============================================ WARNING: possible recursive locking detected 6.4.0-rc5 #2 Not tainted -------------------------------------------- fscryptctl/1421 is trying to acquire lock: ffffff80829ca418 (&profile->lock){++++}-{3:3}, at: __blk_crypto_evict_key+0x44/0x1c0 but task is already holding lock: ffffff8086b68ca8 (&profile->lock){++++}-{3:3}, at: __blk_crypto_evict_key+0xc8/0x1c0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&profile->lock); lock(&profile->lock); *** DEADLOCK *** May be due to missing lock nesting notation Fixes: 1b2628397058 ("block: Keyslot Manager for Inline Encryption") Reported-by: Bart Van Assche Signed-off-by: Eric Biggers Reviewed-by: Bart Van Assche Link: https://lore.kernel.org/r/20230610061139.212085-1-ebiggers@kernel.org Signed-off-by: Jens Axboe --- block/blk-crypto-profile.c | 12 ++++++++++-- include/linux/blk-crypto-profile.h | 1 + 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 2a67d3fb63e5..7fabc883e39f 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -79,7 +79,14 @@ int blk_crypto_profile_init(struct blk_crypto_profile *profile, unsigned int slot_hashtable_size; memset(profile, 0, sizeof(*profile)); - init_rwsem(&profile->lock); + + /* + * profile->lock of an underlying device can nest inside profile->lock + * of a device-mapper device, so use a dynamic lock class to avoid + * false-positive lockdep reports. + */ + lockdep_register_key(&profile->lockdep_key); + __init_rwsem(&profile->lock, "&profile->lock", &profile->lockdep_key); if (num_slots == 0) return 0; @@ -89,7 +96,7 @@ int blk_crypto_profile_init(struct blk_crypto_profile *profile, profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), GFP_KERNEL); if (!profile->slots) - return -ENOMEM; + goto err_destroy; profile->num_slots = num_slots; @@ -435,6 +442,7 @@ void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) { if (!profile) return; + lockdep_unregister_key(&profile->lockdep_key); kvfree(profile->slot_hashtable); kvfree_sensitive(profile->slots, sizeof(profile->slots[0]) * profile->num_slots); diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h index e6802b69cdd6..90ab33cb5d0e 100644 --- a/include/linux/blk-crypto-profile.h +++ b/include/linux/blk-crypto-profile.h @@ -111,6 +111,7 @@ struct blk_crypto_profile { * keyslots while ensuring that they can't be changed concurrently. */ struct rw_semaphore lock; + struct lock_class_key lockdep_key; /* List of idle slots, with least recently used slot at front */ wait_queue_head_t idle_slots_wait_queue; -- cgit v1.2.3 From 0e881c0a4b6146b7e856735226208f48251facd8 Mon Sep 17 00:00:00 2001 From: Tuo Li Date: Fri, 30 Jun 2023 10:47:48 +0800 Subject: scsi: lpfc: Fix a possible data race in lpfc_unregister_fcf_rescan() The variable phba->fcf.fcf_flag is often protected by the lock phba->hbalock() when is accessed. Here is an example in lpfc_unregister_fcf_rescan(): spin_lock_irq(&phba->hbalock); phba->fcf.fcf_flag |= FCF_INIT_DISC; spin_unlock_irq(&phba->hbalock); However, in the same function, phba->fcf.fcf_flag is assigned with 0 without holding the lock, and thus can cause a data race: phba->fcf.fcf_flag = 0; To fix this possible data race, a lock and unlock pair is added when accessing the variable phba->fcf.fcf_flag. Reported-by: BassCheck Signed-off-by: Tuo Li Link: https://lore.kernel.org/r/20230630024748.1035993-1-islituo@gmail.com Reviewed-by: Justin Tee Reviewed-by: Laurence Oberman Signed-off-by: Martin K. Petersen --- drivers/scsi/lpfc/lpfc_hbadisc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 499849b58ee4..fdd7f69d87ef 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -6944,7 +6944,9 @@ lpfc_unregister_fcf_rescan(struct lpfc_hba *phba) if (rc) return; /* Reset HBA FCF states after successful unregister FCF */ + spin_lock_irq(&phba->hbalock); phba->fcf.fcf_flag = 0; + spin_unlock_irq(&phba->hbalock); phba->fcf.current_rec.flag = 0; /* -- cgit v1.2.3 From 134f66959cd0bc90a745f2eed4c10a0519d455c4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 26 Jun 2023 13:58:03 +0300 Subject: scsi: qla2xxx: Silence a static checker warning Smatch and Clang both complain that LOGIN_TEMPLATE_SIZE is more than sizeof(ha->plogi_els_payld.fl_csp). Smatch warning: drivers/scsi/qla2xxx/qla_iocb.c:3075 qla24xx_els_dcmd2_iocb() warn: '&ha->plogi_els_payld.fl_csp' sometimes too small '16' size = 112 Clang warning: include/linux/fortify-string.h:592:4: error: call to '__read_overflow2_field' declared with 'warning' attribute: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror,-Wattribute-warning] __read_overflow2_field(q_size_field, size); When I was reading this code I assumed the "- 4" meant that we were skipping the last 4 bytes but actually it turned out that we are skipping the first four bytes. I have re-written it remove the magic numbers, be more clear and silence the static checker warnings. Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/4aa0485e-766f-4b02-8d5d-c6781ea8f511@moroto.mountain Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_def.h | 1 - drivers/scsi/qla2xxx/qla_iocb.c | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_def.h b/drivers/scsi/qla2xxx/qla_def.h index 95a12b4e0484..892ceba51c23 100644 --- a/drivers/scsi/qla2xxx/qla_def.h +++ b/drivers/scsi/qla2xxx/qla_def.h @@ -4461,7 +4461,6 @@ struct qla_hw_data { /* n2n */ struct fc_els_flogi plogi_els_payld; -#define LOGIN_TEMPLATE_SIZE (sizeof(struct fc_els_flogi) - 4) void *swl; diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index a1675f056a5c..9c70c4e973ee 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -3073,7 +3073,8 @@ qla24xx_els_dcmd2_iocb(scsi_qla_host_t *vha, int els_opcode, memset(ptr, 0, sizeof(struct els_plogi_payload)); memset(resp_ptr, 0, sizeof(struct els_plogi_payload)); memcpy(elsio->u.els_plogi.els_plogi_pyld->data, - &ha->plogi_els_payld.fl_csp, LOGIN_TEMPLATE_SIZE); + (void *)&ha->plogi_els_payld + offsetof(struct fc_els_flogi, fl_csp), + sizeof(ha->plogi_els_payld) - offsetof(struct fc_els_flogi, fl_csp)); elsio->u.els_plogi.els_cmd = els_opcode; elsio->u.els_plogi.els_plogi_pyld->opcode = els_opcode; -- cgit v1.2.3 From e579b007eff3ff8d29d59d16214cd85fb9e573f7 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 26 Jun 2023 13:58:47 +0300 Subject: scsi: qla2xxx: Fix error code in qla2x00_start_sp() This should be negative -EAGAIN instead of positive. The callers treat non-zero error codes the same so it doesn't really impact runtime beyond some trivial differences to debug output. Fixes: 80676d054e5a ("scsi: qla2xxx: Fix session cleanup hang") Signed-off-by: Dan Carpenter Link: https://lore.kernel.org/r/49866d28-4cfe-47b0-842b-78f110e61aab@moroto.mountain Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_iocb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/qla2xxx/qla_iocb.c b/drivers/scsi/qla2xxx/qla_iocb.c index 9c70c4e973ee..730d8609276c 100644 --- a/drivers/scsi/qla2xxx/qla_iocb.c +++ b/drivers/scsi/qla2xxx/qla_iocb.c @@ -3912,7 +3912,7 @@ qla2x00_start_sp(srb_t *sp) pkt = __qla2x00_alloc_iocbs(sp->qpair, sp); if (!pkt) { - rval = EAGAIN; + rval = -EAGAIN; ql_log(ql_log_warn, vha, 0x700c, "qla2x00_alloc_iocbs failed.\n"); goto done; -- cgit v1.2.3 From b34c7dcaf311521f6a0edaffc157d115d386ed9c Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 27 Jun 2023 16:43:22 +0200 Subject: scsi: fnic: Use vmalloc_array() and vcalloc() Use vmalloc_array() and vcalloc() to protect against multiplication overflows. The changes were done using the following Coccinelle semantic patch: // @initialize:ocaml@ @@ let rename alloc = match alloc with "vmalloc" -> "vmalloc_array" | "vzalloc" -> "vcalloc" | _ -> failwith "unknown" @@ size_t e1,e2; constant C1, C2; expression E1, E2, COUNT, x1, x2, x3; typedef u8; typedef __u8; type t = {u8,__u8,char,unsigned char}; identifier alloc = {vmalloc,vzalloc}; fresh identifier realloc = script:ocaml(alloc) { rename alloc }; @@ ( alloc(x1*x2*x3) | alloc(C1 * C2) | alloc((sizeof(t)) * (COUNT), ...) | - alloc((e1) * (e2)) + realloc(e1, e2) | - alloc((e1) * (COUNT)) + realloc(COUNT, e1) | - alloc((E1) * (E2)) + realloc(E1, E2) ) // Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20230627144339.144478-8-Julia.Lawall@inria.fr Signed-off-by: Martin K. Petersen --- drivers/scsi/fnic/fnic_trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/fnic/fnic_trace.c b/drivers/scsi/fnic/fnic_trace.c index f3c3a26a1384..be0d7c57b242 100644 --- a/drivers/scsi/fnic/fnic_trace.c +++ b/drivers/scsi/fnic/fnic_trace.c @@ -465,7 +465,7 @@ int fnic_trace_buf_init(void) fnic_max_trace_entries = (trace_max_pages * PAGE_SIZE)/ FNIC_ENTRY_SIZE_BYTES; - fnic_trace_buf_p = (unsigned long)vzalloc(trace_max_pages * PAGE_SIZE); + fnic_trace_buf_p = (unsigned long)vcalloc(trace_max_pages, PAGE_SIZE); if (!fnic_trace_buf_p) { printk(KERN_ERR PFX "Failed to allocate memory " "for fnic_trace_buf_p\n"); -- cgit v1.2.3 From 04d91b783acf910ceda2af31fe969e4eb572110b Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 27 Jun 2023 16:43:39 +0200 Subject: scsi: qla2xxx: Use vmalloc_array() and vcalloc() Use vmalloc_array() and vcalloc() to protect against multiplication overflows. The changes were done using the following Coccinelle semantic patch: // @initialize:ocaml@ @@ let rename alloc = match alloc with "vmalloc" -> "vmalloc_array" | "vzalloc" -> "vcalloc" | _ -> failwith "unknown" @@ size_t e1,e2; constant C1, C2; expression E1, E2, COUNT, x1, x2, x3; typedef u8; typedef __u8; type t = {u8,__u8,char,unsigned char}; identifier alloc = {vmalloc,vzalloc}; fresh identifier realloc = script:ocaml(alloc) { rename alloc }; @@ ( alloc(x1*x2*x3) | alloc(C1 * C2) | alloc((sizeof(t)) * (COUNT), ...) | - alloc((e1) * (e2)) + realloc(e1, e2) | - alloc((e1) * (COUNT)) + realloc(COUNT, e1) | - alloc((E1) * (E2)) + realloc(E1, E2) ) // Signed-off-by: Julia Lawall Link: https://lore.kernel.org/r/20230627144339.144478-25-Julia.Lawall@inria.fr Signed-off-by: Martin K. Petersen --- drivers/scsi/qla2xxx/qla_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_init.c b/drivers/scsi/qla2xxx/qla_init.c index b22b0516da29..e2d51f68f747 100644 --- a/drivers/scsi/qla2xxx/qla_init.c +++ b/drivers/scsi/qla2xxx/qla_init.c @@ -8434,7 +8434,7 @@ qla24xx_load_risc_flash(scsi_qla_host_t *vha, uint32_t *srisc_addr, ql_dbg(ql_dbg_init, vha, 0x0163, "-> fwdt%u template allocate template %#x words...\n", j, risc_size); - fwdt->template = vmalloc(risc_size * sizeof(*dcode)); + fwdt->template = vmalloc_array(risc_size, sizeof(*dcode)); if (!fwdt->template) { ql_log(ql_log_warn, vha, 0x0164, "-> fwdt%u failed allocate template.\n", j); @@ -8689,7 +8689,7 @@ qla24xx_load_risc_blob(scsi_qla_host_t *vha, uint32_t *srisc_addr) ql_dbg(ql_dbg_init, vha, 0x0173, "-> fwdt%u template allocate template %#x words...\n", j, risc_size); - fwdt->template = vmalloc(risc_size * sizeof(*dcode)); + fwdt->template = vmalloc_array(risc_size, sizeof(*dcode)); if (!fwdt->template) { ql_log(ql_log_warn, vha, 0x0174, "-> fwdt%u failed allocate template.\n", j); -- cgit v1.2.3 From 23815df5af5790c6e99b6bb1ffd39d509d0a7bdb Mon Sep 17 00:00:00 2001 From: Maurizio Lombardi Date: Wed, 28 Jun 2023 17:06:38 +0200 Subject: scsi: scsi_debug: Remove dead code The ramdisk rwlocks are not used anymore. Fixes: 87c715dcde63 ("scsi: scsi_debug: Add per_host_store option") Signed-off-by: Maurizio Lombardi Link: https://lore.kernel.org/r/20230628150638.53218-1-mlombard@redhat.com Reviewed-by: Laurence Oberman Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_debug.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/drivers/scsi/scsi_debug.c b/drivers/scsi/scsi_debug.c index 8c58128ad32a..9c0af50501f9 100644 --- a/drivers/scsi/scsi_debug.c +++ b/drivers/scsi/scsi_debug.c @@ -841,11 +841,6 @@ static int sdeb_zbc_nr_conv = DEF_ZBC_NR_CONV_ZONES; static int submit_queues = DEF_SUBMIT_QUEUES; /* > 1 for multi-queue (mq) */ static int poll_queues; /* iouring iopoll interface.*/ -static DEFINE_RWLOCK(atomic_rw); -static DEFINE_RWLOCK(atomic_rw2); - -static rwlock_t *ramdisk_lck_a[2]; - static char sdebug_proc_name[] = MY_NAME; static const char *my_name = MY_NAME; @@ -6818,9 +6813,6 @@ static int __init scsi_debug_init(void) int k, ret, hosts_to_add; int idx = -1; - ramdisk_lck_a[0] = &atomic_rw; - ramdisk_lck_a[1] = &atomic_rw2; - if (sdebug_ndelay >= 1000 * 1000 * 1000) { pr_warn("ndelay must be less than 1 second, ignored\n"); sdebug_ndelay = 0; -- cgit v1.2.3 From ef470b862dc7e5e169167301b9862bed54d7f969 Mon Sep 17 00:00:00 2001 From: Can Guo Date: Thu, 29 Jun 2023 17:39:09 -0700 Subject: scsi: ufs: core: Update contact email for monitor sysfs nodes Update contact email addresses for Can Guo and Asutosh Das, replace Subhash Jadavani's contact. Signed-off-by: Can Guo Link: https://lore.kernel.org/r/20230630003913.3713155-1-quic_cang@quicinc.com Signed-off-by: Martin K. Petersen --- Documentation/ABI/testing/sysfs-driver-ufs | 76 +++++++++++++++--------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-driver-ufs b/Documentation/ABI/testing/sysfs-driver-ufs index d5f44fc5b9dc..e487f969a15e 100644 --- a/Documentation/ABI/testing/sysfs-driver-ufs +++ b/Documentation/ABI/testing/sysfs-driver-ufs @@ -994,7 +994,7 @@ Description: This file shows the amount of physical memory needed What: /sys/bus/platform/drivers/ufshcd/*/rpm_lvl What: /sys/bus/platform/devices/*.ufs/rpm_lvl Date: September 2014 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry could be used to set or show the UFS device runtime power management level. The current driver implementation supports 7 levels with next target states: @@ -1021,7 +1021,7 @@ Description: This entry could be used to set or show the UFS device What: /sys/bus/platform/drivers/ufshcd/*/rpm_target_dev_state What: /sys/bus/platform/devices/*.ufs/rpm_target_dev_state Date: February 2018 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry shows the target power mode of an UFS device for the chosen runtime power management level. @@ -1030,7 +1030,7 @@ Description: This entry shows the target power mode of an UFS device What: /sys/bus/platform/drivers/ufshcd/*/rpm_target_link_state What: /sys/bus/platform/devices/*.ufs/rpm_target_link_state Date: February 2018 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry shows the target state of an UFS UIC link for the chosen runtime power management level. @@ -1039,7 +1039,7 @@ Description: This entry shows the target state of an UFS UIC link What: /sys/bus/platform/drivers/ufshcd/*/spm_lvl What: /sys/bus/platform/devices/*.ufs/spm_lvl Date: September 2014 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry could be used to set or show the UFS device system power management level. The current driver implementation supports 7 levels with next target states: @@ -1066,7 +1066,7 @@ Description: This entry could be used to set or show the UFS device What: /sys/bus/platform/drivers/ufshcd/*/spm_target_dev_state What: /sys/bus/platform/devices/*.ufs/spm_target_dev_state Date: February 2018 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry shows the target power mode of an UFS device for the chosen system power management level. @@ -1075,7 +1075,7 @@ Description: This entry shows the target power mode of an UFS device What: /sys/bus/platform/drivers/ufshcd/*/spm_target_link_state What: /sys/bus/platform/devices/*.ufs/spm_target_link_state Date: February 2018 -Contact: Subhash Jadavani +Contact: Can Guo Description: This entry shows the target state of an UFS UIC link for the chosen system power management level. @@ -1084,7 +1084,7 @@ Description: This entry shows the target state of an UFS UIC link What: /sys/bus/platform/drivers/ufshcd/*/monitor/monitor_enable What: /sys/bus/platform/devices/*.ufs/monitor/monitor_enable Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the status of performance monitor enablement and it can be used to start/stop the monitor. When the monitor is stopped, the performance data collected is also cleared. @@ -1092,7 +1092,7 @@ Description: This file shows the status of performance monitor enablement What: /sys/bus/platform/drivers/ufshcd/*/monitor/monitor_chunk_size What: /sys/bus/platform/devices/*.ufs/monitor/monitor_chunk_size Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file tells the monitor to focus on requests transferring data of specific chunk size (in Bytes). 0 means any chunk size. It can only be changed when monitor is disabled. @@ -1100,7 +1100,7 @@ Description: This file tells the monitor to focus on requests transferring What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_total_sectors What: /sys/bus/platform/devices/*.ufs/monitor/read_total_sectors Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how many sectors (in 512 Bytes) have been sent from device to host after monitor gets started. @@ -1109,7 +1109,7 @@ Description: This file shows how many sectors (in 512 Bytes) have been What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_total_busy What: /sys/bus/platform/devices/*.ufs/monitor/read_total_busy Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how long (in micro seconds) has been spent sending data from device to host after monitor gets started. @@ -1118,7 +1118,7 @@ Description: This file shows how long (in micro seconds) has been spent What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_nr_requests What: /sys/bus/platform/devices/*.ufs/monitor/read_nr_requests Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how many read requests have been sent after monitor gets started. @@ -1127,7 +1127,7 @@ Description: This file shows how many read requests have been sent after What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_max What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_max Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the maximum latency (in micro seconds) of read requests after monitor gets started. @@ -1136,7 +1136,7 @@ Description: This file shows the maximum latency (in micro seconds) of What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_min What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_min Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the minimum latency (in micro seconds) of read requests after monitor gets started. @@ -1145,7 +1145,7 @@ Description: This file shows the minimum latency (in micro seconds) of What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_avg What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_avg Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the average latency (in micro seconds) of read requests after monitor gets started. @@ -1154,7 +1154,7 @@ Description: This file shows the average latency (in micro seconds) of What: /sys/bus/platform/drivers/ufshcd/*/monitor/read_req_latency_sum What: /sys/bus/platform/devices/*.ufs/monitor/read_req_latency_sum Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the total latency (in micro seconds) of read requests sent after monitor gets started. @@ -1163,7 +1163,7 @@ Description: This file shows the total latency (in micro seconds) of What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_total_sectors What: /sys/bus/platform/devices/*.ufs/monitor/write_total_sectors Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how many sectors (in 512 Bytes) have been sent from host to device after monitor gets started. @@ -1172,7 +1172,7 @@ Description: This file shows how many sectors (in 512 Bytes) have been sent What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_total_busy What: /sys/bus/platform/devices/*.ufs/monitor/write_total_busy Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how long (in micro seconds) has been spent sending data from host to device after monitor gets started. @@ -1181,7 +1181,7 @@ Description: This file shows how long (in micro seconds) has been spent What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_nr_requests What: /sys/bus/platform/devices/*.ufs/monitor/write_nr_requests Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows how many write requests have been sent after monitor gets started. @@ -1190,7 +1190,7 @@ Description: This file shows how many write requests have been sent after What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_max What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_max Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the maximum latency (in micro seconds) of write requests after monitor gets started. @@ -1199,7 +1199,7 @@ Description: This file shows the maximum latency (in micro seconds) of write What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_min What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_min Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the minimum latency (in micro seconds) of write requests after monitor gets started. @@ -1208,7 +1208,7 @@ Description: This file shows the minimum latency (in micro seconds) of write What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_avg What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_avg Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the average latency (in micro seconds) of write requests after monitor gets started. @@ -1217,7 +1217,7 @@ Description: This file shows the average latency (in micro seconds) of write What: /sys/bus/platform/drivers/ufshcd/*/monitor/write_req_latency_sum What: /sys/bus/platform/devices/*.ufs/monitor/write_req_latency_sum Date: January 2021 -Contact: Can Guo +Contact: Can Guo Description: This file shows the total latency (in micro seconds) of write requests after monitor gets started. @@ -1226,7 +1226,7 @@ Description: This file shows the total latency (in micro seconds) of write What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_presv_us_en What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_presv_us_en Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows if preserve user-space was configured The file is read only. @@ -1234,7 +1234,7 @@ Description: This entry shows if preserve user-space was configured What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_shared_alloc_units What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_shared_alloc_units Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the shared allocated units of WB buffer The file is read only. @@ -1242,7 +1242,7 @@ Description: This entry shows the shared allocated units of WB buffer What: /sys/bus/platform/drivers/ufshcd/*/device_descriptor/wb_type What: /sys/bus/platform/devices/*.ufs/device_descriptor/wb_type Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the configured WB type. 0x1 for shared buffer mode. 0x0 for dedicated buffer mode. @@ -1251,7 +1251,7 @@ Description: This entry shows the configured WB type. What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_buff_cap_adj What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_buff_cap_adj Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the total user-space decrease in shared buffer mode. The value of this parameter is 3 for TLC NAND when SLC mode @@ -1262,7 +1262,7 @@ Description: This entry shows the total user-space decrease in shared What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_max_alloc_units What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_max_alloc_units Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the Maximum total WriteBooster Buffer size which is supported by the entire device. @@ -1271,7 +1271,7 @@ Description: This entry shows the Maximum total WriteBooster Buffer size What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_max_wb_luns What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_max_wb_luns Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the maximum number of luns that can support WriteBooster. @@ -1280,7 +1280,7 @@ Description: This entry shows the maximum number of luns that can support What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_sup_red_type What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_sup_red_type Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: The supportability of user space reduction mode and preserve user space mode. 00h: WriteBooster Buffer can be configured only in @@ -1295,7 +1295,7 @@ Description: The supportability of user space reduction mode What: /sys/bus/platform/drivers/ufshcd/*/geometry_descriptor/wb_sup_wb_type What: /sys/bus/platform/devices/*.ufs/geometry_descriptor/wb_sup_wb_type Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: The supportability of WriteBooster Buffer type. === ========================================================== @@ -1310,7 +1310,7 @@ Description: The supportability of WriteBooster Buffer type. What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_enable What: /sys/bus/platform/devices/*.ufs/flags/wb_enable Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the status of WriteBooster. == ============================ @@ -1323,7 +1323,7 @@ Description: This entry shows the status of WriteBooster. What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_flush_en What: /sys/bus/platform/devices/*.ufs/flags/wb_flush_en Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows if flush is enabled. == ================================= @@ -1336,7 +1336,7 @@ Description: This entry shows if flush is enabled. What: /sys/bus/platform/drivers/ufshcd/*/flags/wb_flush_during_h8 What: /sys/bus/platform/devices/*.ufs/flags/wb_flush_during_h8 Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: Flush WriteBooster Buffer during hibernate state. == ================================================= @@ -1351,7 +1351,7 @@ Description: Flush WriteBooster Buffer during hibernate state. What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_avail_buf What: /sys/bus/platform/devices/*.ufs/attributes/wb_avail_buf Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the amount of unused WriteBooster buffer available. @@ -1360,7 +1360,7 @@ Description: This entry shows the amount of unused WriteBooster buffer What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_cur_buf What: /sys/bus/platform/devices/*.ufs/attributes/wb_cur_buf Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the amount of unused current buffer. The file is read only. @@ -1368,7 +1368,7 @@ Description: This entry shows the amount of unused current buffer. What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_flush_status What: /sys/bus/platform/devices/*.ufs/attributes/wb_flush_status Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the flush operation status. @@ -1385,7 +1385,7 @@ Description: This entry shows the flush operation status. What: /sys/bus/platform/drivers/ufshcd/*/attributes/wb_life_time_est What: /sys/bus/platform/devices/*.ufs/attributes/wb_life_time_est Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows an indication of the WriteBooster Buffer lifetime based on the amount of performed program/erase cycles @@ -1399,7 +1399,7 @@ Description: This entry shows an indication of the WriteBooster Buffer What: /sys/class/scsi_device/*/device/unit_descriptor/wb_buf_alloc_units Date: June 2020 -Contact: Asutosh Das +Contact: Asutosh Das Description: This entry shows the configured size of WriteBooster buffer. 0400h corresponds to 4GB. -- cgit v1.2.3 From 89f7ef7f2b23b2a7b8ce346c23161916eae5b15c Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 30 Jun 2023 22:23:48 -0700 Subject: scsi: ufs: ufs-mediatek: Add dependency for RESET_CONTROLLER When RESET_CONTROLLER is not set, kconfig complains about missing dependencies for RESET_TI_SYSCON, so add the missing dependency just as is done above for SCSI_UFS_QCOM. Silences this kconfig warning: WARNING: unmet direct dependencies detected for RESET_TI_SYSCON Depends on [n]: RESET_CONTROLLER [=n] && HAS_IOMEM [=y] Selected by [m]: - SCSI_UFS_MEDIATEK [=m] && SCSI_UFSHCD [=y] && SCSI_UFSHCD_PLATFORM [=y] && ARCH_MEDIATEK [=y] Fixes: de48898d0cb6 ("scsi: ufs-mediatek: Create reset control device_link") Signed-off-by: Randy Dunlap Link: lore.kernel.org/r/202306020859.1wHg9AaT-lkp@intel.com Link: https://lore.kernel.org/r/20230701052348.28046-1-rdunlap@infradead.org Cc: Stanley Chu Cc: Peter Wang Cc: Paul Gazzillo Cc: Necip Fazil Yildiran Cc: linux-scsi@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-mediatek@lists.infradead.org Cc: "James E.J. Bottomley" Cc: "Martin K. Petersen" Reported-by: kernel test robot Signed-off-by: Martin K. Petersen --- drivers/ufs/host/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/ufs/host/Kconfig b/drivers/ufs/host/Kconfig index 16624ba08050..580c8d0bd8bb 100644 --- a/drivers/ufs/host/Kconfig +++ b/drivers/ufs/host/Kconfig @@ -72,6 +72,7 @@ config SCSI_UFS_QCOM config SCSI_UFS_MEDIATEK tristate "Mediatek specific hooks to UFS controller platform driver" depends on SCSI_UFSHCD_PLATFORM && ARCH_MEDIATEK + depends on RESET_CONTROLLER select PHY_MTK_UFS select RESET_TI_SYSCON help -- cgit v1.2.3 From 47699a2b63caaa0de4841d4402627c2fdf3452a6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Jul 2023 13:48:46 +0200 Subject: scsi: aacraid: Avoid -Warray-bounds warning The one-element array in aac_aifcmd is actually meant as a flexible array, and causes an overflow warning that can be avoided using the normal flex arrays: drivers/scsi/aacraid/commsup.c:1166:17: error: array index 1 is past the end of the array (that has type 'u8[1]' (aka 'unsigned char[1]'), cast to '__le32 *' (aka 'unsigned int *')) [-Werror,-Warray-bounds] (((__le32 *)aifcmd->data)[1] == cpu_to_le32(3)); ^ ~ Signed-off-by: Arnd Bergmann Link: https://lore.kernel.org/r/20230703114851.1194510-1-arnd@kernel.org Signed-off-by: Martin K. Petersen --- drivers/scsi/aacraid/aacraid.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/aacraid/aacraid.h b/drivers/scsi/aacraid/aacraid.h index 5e115e8b2ba4..7d5a155073c6 100644 --- a/drivers/scsi/aacraid/aacraid.h +++ b/drivers/scsi/aacraid/aacraid.h @@ -2617,7 +2617,7 @@ struct aac_hba_info { struct aac_aifcmd { __le32 command; /* Tell host what type of notify this is */ __le32 seqnum; /* To allow ordering of reports (if necessary) */ - u8 data[1]; /* Undefined length (from kernel viewpoint) */ + u8 data[]; /* Undefined length (from kernel viewpoint) */ }; /** -- cgit v1.2.3 From 24befa92ed47ed2af25868412a1275602ca8f4ea Mon Sep 17 00:00:00 2001 From: Arthur Simchaev Date: Mon, 26 Jun 2023 13:33:19 +0300 Subject: scsi: ufs: core: Add support for qTimestamp attribute The new qTimestamp attribute was added to UFS 4.0 spec, in order to synchronize timestamp between device logs and the host. The spec recommends to send this attribute upon device power-on Reset/HW reset or when switching to Active state (using SSU command). Due to this attribute, the attribute's max value was extended to 8 bytes. As a result, the new definition of struct utp_upiu_query_v4_0 was added. Signed-off-by: Arthur Simchaev ----------------- Changes to v2: - Adressed Bart's comments - Add missed response variable to ufshcd_set_timestamp_attr Link: https://lore.kernel.org/r/20230626103320.8737-1-arthur.simchaev@wdc.com Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 38 ++++++++++++++++++++++++++++++++++++++ include/uapi/scsi/scsi_bsg_ufs.h | 25 +++++++++++++++++++++++++ include/ufs/ufs.h | 1 + 3 files changed, 64 insertions(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 358b3240b6c5..e2812911e462 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -8520,6 +8520,41 @@ out: return ret; } +static void ufshcd_set_timestamp_attr(struct ufs_hba *hba) +{ + int err; + struct ufs_query_req *request = NULL; + struct ufs_query_res *response = NULL; + struct ufs_dev_info *dev_info = &hba->dev_info; + struct utp_upiu_query_v4_0 *upiu_data; + + if (dev_info->wspecversion < 0x400) + return; + + ufshcd_hold(hba); + + mutex_lock(&hba->dev_cmd.lock); + + ufshcd_init_query(hba, &request, &response, + UPIU_QUERY_OPCODE_WRITE_ATTR, + QUERY_ATTR_IDN_TIMESTAMP, 0, 0); + + request->query_func = UPIU_QUERY_FUNC_STANDARD_WRITE_REQUEST; + + upiu_data = (struct utp_upiu_query_v4_0 *)&request->upiu_req; + + put_unaligned_be64(ktime_get_real_ns(), &upiu_data->osf3); + + err = ufshcd_exec_dev_cmd(hba, DEV_CMD_TYPE_QUERY, QUERY_REQ_TIMEOUT); + + if (err) + dev_err(hba->dev, "%s: failed to set timestamp %d\n", + __func__, err); + + mutex_unlock(&hba->dev_cmd.lock); + ufshcd_release(hba); +} + /** * ufshcd_add_lus - probe and add UFS logical units * @hba: per-adapter instance @@ -8708,6 +8743,8 @@ static int ufshcd_device_init(struct ufs_hba *hba, bool init_dev_params) ufshcd_set_ufs_dev_active(hba); ufshcd_force_reset_auto_bkops(hba); + ufshcd_set_timestamp_attr(hba); + /* Gear up to HS gear if supported */ if (hba->max_pwr_info.is_valid) { /* @@ -9741,6 +9778,7 @@ static int __ufshcd_wl_resume(struct ufs_hba *hba, enum ufs_pm_op pm_op) ret = ufshcd_set_dev_pwr_mode(hba, UFS_ACTIVE_PWR_MODE); if (ret) goto set_old_link_state; + ufshcd_set_timestamp_attr(hba); } if (ufshcd_keep_autobkops_enabled_except_suspend(hba)) diff --git a/include/uapi/scsi/scsi_bsg_ufs.h b/include/uapi/scsi/scsi_bsg_ufs.h index 2801b65299aa..fd3f9e5ee241 100644 --- a/include/uapi/scsi/scsi_bsg_ufs.h +++ b/include/uapi/scsi/scsi_bsg_ufs.h @@ -70,6 +70,31 @@ struct utp_upiu_query { __be32 reserved[2]; }; +/** + * struct utp_upiu_query_v4_0 - upiu request buffer structure for + * query request >= UFS 4.0 spec. + * @opcode: command to perform B-0 + * @idn: a value that indicates the particular type of data B-1 + * @index: Index to further identify data B-2 + * @selector: Index to further identify data B-3 + * @osf4: spec field B-5 + * @osf5: spec field B 6,7 + * @osf6: spec field DW 8,9 + * @osf7: spec field DW 10,11 + */ +struct utp_upiu_query_v4_0 { + __u8 opcode; + __u8 idn; + __u8 index; + __u8 selector; + __u8 osf3; + __u8 osf4; + __be16 osf5; + __be32 osf6; + __be32 osf7; + __be32 reserved; +}; + /** * struct utp_upiu_cmd - Command UPIU structure * @data_transfer_len: Data Transfer Length DW-3 diff --git a/include/ufs/ufs.h b/include/ufs/ufs.h index 4e8d6240e589..198cb391f9db 100644 --- a/include/ufs/ufs.h +++ b/include/ufs/ufs.h @@ -170,6 +170,7 @@ enum attr_idn { QUERY_ATTR_IDN_WB_BUFF_LIFE_TIME_EST = 0x1E, QUERY_ATTR_IDN_CURR_WB_BUFF_SIZE = 0x1F, QUERY_ATTR_IDN_EXT_IID_EN = 0x2A, + QUERY_ATTR_IDN_TIMESTAMP = 0x30 }; /* Descriptor idn for Query requests */ -- cgit v1.2.3 From f79846ca2f04c9744627c24034d675c88f0da3a0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 3 Jul 2023 11:48:08 +0900 Subject: scsi: sd_zbc: Set zone limits before revalidating zones In sd_zbc_revalidate_zones(), execute blk_queue_chunk_sectors() and blk_queue_max_zone_append_sectors() to respectively set a ZBC device zone size and maximum zone append sector limit before executing blk_revalidate_disk_zones(). This is to allow the block layer zone reavlidation to check these device characteristics prior to checking all zones of the device. Since blk_queue_max_zone_append_sectors() already caps the device maximum zone append limit to the zone size and to the maximum command size, the max_append value passed to blk_queue_max_zone_append_sectors() is simplified to the maximum number of segments times the number of sectors per page. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230703024812.76778-2-dlemoal@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/scsi/sd_zbc.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index abbd08933ac7..a25215507668 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -831,7 +831,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) struct request_queue *q = disk->queue; u32 zone_blocks = sdkp->early_zone_info.zone_blocks; unsigned int nr_zones = sdkp->early_zone_info.nr_zones; - u32 max_append; int ret = 0; unsigned int flags; @@ -876,6 +875,11 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) goto unlock; } + blk_queue_chunk_sectors(q, + logical_to_sectors(sdkp->device, zone_blocks)); + blk_queue_max_zone_append_sectors(q, + q->limits.max_segments << PAGE_SECTORS_SHIFT); + ret = blk_revalidate_disk_zones(disk, sd_zbc_revalidate_zones_cb); memalloc_noio_restore(flags); @@ -888,12 +892,6 @@ int sd_zbc_revalidate_zones(struct scsi_disk *sdkp) goto unlock; } - max_append = min_t(u32, logical_to_sectors(sdkp->device, zone_blocks), - q->limits.max_segments << PAGE_SECTORS_SHIFT); - max_append = min_t(u32, max_append, queue_max_hw_sectors(q)); - - blk_queue_max_zone_append_sectors(q, max_append); - sd_zbc_print_zones(sdkp); unlock: -- cgit v1.2.3 From d226b0a2b683e27fce060669dc7cb8d6917b785a Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 3 Jul 2023 11:48:09 +0900 Subject: scsi: nvme: zns: Set zone limits before revalidating zones In nvme_revalidate_zones(), execute blk_queue_chunk_sectors() and blk_queue_max_zone_append_sectors() to respectively set a ZNS namespace zone size and maximum zone append sector limit before executing blk_revalidate_disk_zones(). This is to allow the block layer zone reavlidation to check these device characteristics prior to checking all zones of the device. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230703024812.76778-3-dlemoal@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/nvme/host/zns.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/zns.c b/drivers/nvme/host/zns.c index 12316ab51bda..ec8557810c21 100644 --- a/drivers/nvme/host/zns.c +++ b/drivers/nvme/host/zns.c @@ -10,12 +10,11 @@ int nvme_revalidate_zones(struct nvme_ns *ns) { struct request_queue *q = ns->queue; - int ret; - ret = blk_revalidate_disk_zones(ns->disk, NULL); - if (!ret) - blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); - return ret; + blk_queue_chunk_sectors(q, ns->zsze); + blk_queue_max_zone_append_sectors(q, ns->ctrl->max_zone_append); + + return blk_revalidate_disk_zones(ns->disk, NULL); } static int nvme_set_max_append(struct nvme_ctrl *ctrl) -- cgit v1.2.3 From a442b899fe17118e672647f664bf7f1dd5d7fcb0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 3 Jul 2023 11:48:10 +0900 Subject: scsi: block: nullblk: Set zone limits before revalidating zones In null_register_zoned_dev(), execute blk_queue_chunk_sectors() and blk_queue_max_zone_append_sectors() to respectively set the zoned device zone size and maximum zone append sector limit before executing blk_revalidate_disk_zones(). This is to allow the block layer zone reavlidation to check these device characteristics prior to checking all zones of the device. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230703024812.76778-4-dlemoal@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/block/null_blk/zoned.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 635ce0648133..55c5b48bc276 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -162,21 +162,15 @@ int null_register_zoned_dev(struct nullb *nullb) disk_set_zoned(nullb->disk, BLK_ZONED_HM); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); - - if (queue_is_mq(q)) { - int ret = blk_revalidate_disk_zones(nullb->disk, NULL); - - if (ret) - return ret; - } else { - blk_queue_chunk_sectors(q, dev->zone_size_sects); - nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); - } - + blk_queue_chunk_sectors(q, dev->zone_size_sects); + nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); disk_set_max_open_zones(nullb->disk, dev->zone_max_open); disk_set_max_active_zones(nullb->disk, dev->zone_max_active); + if (queue_is_mq(q)) + return blk_revalidate_disk_zones(nullb->disk, NULL); + return 0; } -- cgit v1.2.3 From a3d96ed21507e8d70ddab6c7abc93d5e56aaeeb0 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 3 Jul 2023 11:48:11 +0900 Subject: scsi: block: virtio_blk: Set zone limits before revalidating zones In virtblk_probe_zoned_device(), execute blk_queue_chunk_sectors() and blk_queue_max_zone_append_sectors() to respectively set the zoned device zone size and maximum zone append sector limit before executing blk_revalidate_disk_zones(). This is to allow the block layer zone reavlidation to check these device characteristics prior to checking all zones of the device. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230703024812.76778-5-dlemoal@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Dmitry Fomichev Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- drivers/block/virtio_blk.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 2b918e28acaa..d6df29566370 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -781,7 +781,6 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, { u32 v, wg; u8 model; - int ret; virtio_cread(vdev, struct virtio_blk_config, zoned.model, &model); @@ -836,6 +835,7 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, vblk->zone_sectors); return -ENODEV; } + blk_queue_chunk_sectors(q, vblk->zone_sectors); dev_dbg(&vdev->dev, "zone sectors = %u\n", vblk->zone_sectors); if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) { @@ -844,26 +844,22 @@ static int virtblk_probe_zoned_device(struct virtio_device *vdev, blk_queue_max_discard_sectors(q, 0); } - ret = blk_revalidate_disk_zones(vblk->disk, NULL); - if (!ret) { - virtio_cread(vdev, struct virtio_blk_config, - zoned.max_append_sectors, &v); - if (!v) { - dev_warn(&vdev->dev, "zero max_append_sectors reported\n"); - return -ENODEV; - } - if ((v << SECTOR_SHIFT) < wg) { - dev_err(&vdev->dev, - "write granularity %u exceeds max_append_sectors %u limit\n", - wg, v); - return -ENODEV; - } - - blk_queue_max_zone_append_sectors(q, v); - dev_dbg(&vdev->dev, "max append sectors = %u\n", v); + virtio_cread(vdev, struct virtio_blk_config, + zoned.max_append_sectors, &v); + if (!v) { + dev_warn(&vdev->dev, "zero max_append_sectors reported\n"); + return -ENODEV; + } + if ((v << SECTOR_SHIFT) < wg) { + dev_err(&vdev->dev, + "write granularity %u exceeds max_append_sectors %u limit\n", + wg, v); + return -ENODEV; } + blk_queue_max_zone_append_sectors(q, v); + dev_dbg(&vdev->dev, "max append sectors = %u\n", v); - return ret; + return blk_revalidate_disk_zones(vblk->disk, NULL); } #else -- cgit v1.2.3 From 03e51c4a74b91b0b1a9ca091029b0b58f014be81 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 3 Jul 2023 11:48:12 +0900 Subject: scsi: block: Improve checks in blk_revalidate_disk_zones() blk_revalidate_disk_zones() implements checks of the zones of a zoned block device, verifying that the zone size is a power of 2 number of sectors, that all zones (except possibly the last one) have the same size and that zones cover the entire addressing space of the device. While these checks are appropriate to verify that well tested hardware devices have an adequate zone configurations, they lack in certain areas which may result in issues with emulated devices implemented with user drivers such as ublk or tcmu. Specifically, this function does not check if the device driver indicated support for the mandatory zone append writes, that is, if the device max_zone_append_sectors queue limit is set to a non-zero value. Additionally, invalid zones such as a zero length zone with a start sector equal to the device capacity will not be detected and result in out of bounds use of the zone bitmaps prepared with the callback function blk_revalidate_zone_cb(). Improve blk_revalidate_disk_zones() to address these inadequate checks, relying on the fact that all device drivers supporting zoned block devices must set the device zone size (chunk_sectors queue limit) and the max_zone_append_sectors queue limit before executing this function. The check for a non-zero max_zone_append_sectors value is done in blk_revalidate_disk_zones() before executing the zone report. The zone report callback function blk_revalidate_zone_cb() is also modified to add a check that a zone start is below the device capacity. The check that the zone size is a power of 2 number of sectors is moved to blk_revalidate_disk_zones() as the zone size is already known. Similarly, the number of zones of the device can be calculated in blk_revalidate_disk_zones() before executing the zone report. The kdoc comment for blk_revalidate_disk_zones() is also updated to mention that device drivers must set the device zone size and the max_zone_append_sectors queue limit before calling this function. Signed-off-by: Damien Le Moal Link: https://lore.kernel.org/r/20230703024812.76778-6-dlemoal@kernel.org Reviewed-by: Bart Van Assche Reviewed-by: Johannes Thumshirn Signed-off-by: Martin K. Petersen --- block/blk-zoned.c | 86 ++++++++++++++++++++++++++++++++----------------------- 1 file changed, 50 insertions(+), 36 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index fce9082384d6..da92ce0c5da9 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -448,7 +448,6 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int nr_zones; - sector_t zone_sectors; sector_t sector; }; @@ -462,38 +461,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = args->disk; struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); + sector_t zone_sectors = q->limits.chunk_sectors; + + /* Check for bad zones and holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + if (zone->start >= capacity || !zone->len) { + pr_warn("%s: Invalid zone start %llu, length %llu\n", + disk->disk_name, zone->start, zone->len); + return -ENODEV; + } /* * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start == 0) { - if (zone->len == 0 || !is_power_of_2(zone->len)) { - pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", - disk->disk_name, zone->len); - return -ENODEV; - } - - args->zone_sectors = zone->len; - args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); - } else if (zone->start + args->zone_sectors < capacity) { - if (zone->len != args->zone_sectors) { + if (zone->start + zone->len < capacity) { + if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); return -ENODEV; } - } else { - if (zone->len > args->zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; - } - } - - /* Check for holes in the zone report */ - if (zone->start != args->sector) { - pr_warn("%s: Zone gap at sectors %llu..%llu\n", - disk->disk_name, args->sector, zone->start); + } else if (zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); return -ENODEV; } @@ -532,11 +527,13 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * @disk: Target disk * @update_driver_data: Callback to update driver data on the frozen disk * - * Helper function for low-level device drivers to (re) allocate and initialize - * a disk request queue zone bitmaps. This functions should normally be called - * within the disk ->revalidate method for blk-mq based drivers. For BIO based - * drivers only q->nr_zones needs to be updated so that the sysfs exposed value - * is correct. + * Helper function for low-level device drivers to check and (re) allocate and + * initialize a disk request queue zone bitmaps. This functions should normally + * be called within the disk ->revalidate method for blk-mq based drivers. + * Before calling this function, the device driver must already have set the + * device zone size (chunk_sector limit) and the max zone append limit. + * For BIO based drivers, this function cannot be used. BIO based device drivers + * only need to set disk->nr_zones so that the sysfs exposed value is correct. * If the @update_driver_data callback function is not NULL, the callback is * executed with the device request queue frozen after all zones have been * checked. @@ -545,9 +542,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; - struct blk_revalidate_zone_args args = { - .disk = disk, - }; + sector_t zone_sectors = q->limits.chunk_sectors; + sector_t capacity = get_capacity(disk); + struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; int ret; @@ -556,13 +553,31 @@ int blk_revalidate_disk_zones(struct gendisk *disk, if (WARN_ON_ONCE(!queue_is_mq(q))) return -EIO; - if (!get_capacity(disk)) - return -EIO; + if (!capacity) + return -ENODEV; + + /* + * Checks that the device driver indicated a valid zone size and that + * the max zone append limit is set. + */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) { + pr_warn("%s: Invalid non power of two zone size (%llu)\n", + disk->disk_name, zone_sectors); + return -ENODEV; + } + + if (!q->limits.max_zone_append_sectors) { + pr_warn("%s: Invalid 0 maximum zone append limit\n", + disk->disk_name); + return -ENODEV; + } /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ + args.disk = disk; + args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); @@ -576,7 +591,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, * If zones where reported, make sure that the entire disk capacity * has been checked. */ - if (ret > 0 && args.sector != get_capacity(disk)) { + if (ret > 0 && args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; @@ -589,7 +604,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk, */ blk_mq_freeze_queue(q); if (ret > 0) { - blk_queue_chunk_sectors(q, args.zone_sectors); disk->nr_zones = args.nr_zones; swap(disk->seq_zones_wlock, args.seq_zones_wlock); swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); -- cgit v1.2.3 From 00ae1491f970acc454be0df63f50942d94825860 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Jul 2023 15:37:51 +0300 Subject: dma-buf: fix an error pointer vs NULL bug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Smatch detected potential error pointer dereference. drivers/gpu/drm/drm_syncobj.c:888 drm_syncobj_transfer_to_timeline() error: 'fence' dereferencing possible ERR_PTR() The error pointer comes from dma_fence_allocate_private_stub(). One caller expected error pointers and one expected NULL pointers. Change it to return NULL and update the caller which expected error pointers, drm_syncobj_assign_null_handle(), to check for NULL instead. Fixes: f781f661e8c9 ("dma-buf: keep the signaling time of merged fences v3") Signed-off-by: Dan Carpenter Reviewed-by: Christian König Reviewed-by: Sumit Semwal Signed-off-by: Sumit Semwal Link: https://patchwork.freedesktop.org/patch/msgid/b09f1996-3838-4fa2-9193-832b68262e43@moroto.mountain --- drivers/dma-buf/dma-fence.c | 2 +- drivers/gpu/drm/drm_syncobj.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma-buf/dma-fence.c b/drivers/dma-buf/dma-fence.c index ad076f208760..8aa8f8cb7071 100644 --- a/drivers/dma-buf/dma-fence.c +++ b/drivers/dma-buf/dma-fence.c @@ -160,7 +160,7 @@ struct dma_fence *dma_fence_allocate_private_stub(ktime_t timestamp) fence = kzalloc(sizeof(*fence), GFP_KERNEL); if (fence == NULL) - return ERR_PTR(-ENOMEM); + return NULL; dma_fence_init(fence, &dma_fence_stub_ops, diff --git a/drivers/gpu/drm/drm_syncobj.c b/drivers/gpu/drm/drm_syncobj.c index 04589a35eb09..e592c5da70ce 100644 --- a/drivers/gpu/drm/drm_syncobj.c +++ b/drivers/gpu/drm/drm_syncobj.c @@ -355,8 +355,8 @@ static int drm_syncobj_assign_null_handle(struct drm_syncobj *syncobj) { struct dma_fence *fence = dma_fence_allocate_private_stub(ktime_get()); - if (IS_ERR(fence)) - return PTR_ERR(fence); + if (!fence) + return -ENOMEM; drm_syncobj_replace_fence(syncobj, fence); dma_fence_put(fence); -- cgit v1.2.3 From 15008052b34efaa86c1d56190ac73c4bf8c462f9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Thu, 6 Jul 2023 17:30:31 +0200 Subject: drm/fbdev-dma: Fix documented default preferred_bpp value As of commit 6c80a93be62d398e ("drm/fb-helper: Initialize fb-helper's preferred BPP in prepare function"), the preferred_bpp parameter of drm_fb_helper_prepare() defaults to 32 instead of drm_mode_config.preferred_depth. Hence this also applies to drm_fbdev_dma_setup(), which just passes its own preferred_bpp parameter. Fixes: b79fe9abd58bab73 ("drm/fbdev-dma: Implement fbdev emulation for GEM DMA helpers") Signed-off-by: Geert Uytterhoeven Reviewed-by: Thomas Zimmermann Signed-off-by: Thomas Zimmermann Link: https://patchwork.freedesktop.org/patch/msgid/91f093ffe436a9f94d58fb2bfbc1407f1ebe8bb0.1688656591.git.geert+renesas@glider.be --- drivers/gpu/drm/drm_fbdev_dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_fbdev_dma.c b/drivers/gpu/drm/drm_fbdev_dma.c index 728deffcc0d9..e85cdf69cd6c 100644 --- a/drivers/gpu/drm/drm_fbdev_dma.c +++ b/drivers/gpu/drm/drm_fbdev_dma.c @@ -218,7 +218,7 @@ static const struct drm_client_funcs drm_fbdev_dma_client_funcs = { * drm_fbdev_dma_setup() - Setup fbdev emulation for GEM DMA helpers * @dev: DRM device * @preferred_bpp: Preferred bits per pixel for the device. - * @dev->mode_config.preferred_depth is used if this is zero. + * 32 is used if this is zero. * * This function sets up fbdev emulation for GEM DMA drivers that support * dumb buffers with a virtual address and that can be mmap'ed. -- cgit v1.2.3 From 8a796565cec3601071cbbd27d6304e202019d014 Mon Sep 17 00:00:00 2001 From: Andres Freund Date: Fri, 7 Jul 2023 09:20:07 -0700 Subject: io_uring: Use io_schedule* in cqring wait I observed poor performance of io_uring compared to synchronous IO. That turns out to be caused by deeper CPU idle states entered with io_uring, due to io_uring using plain schedule(), whereas synchronous IO uses io_schedule(). The losses due to this are substantial. On my cascade lake workstation, t/io_uring from the fio repository e.g. yields regressions between 20% and 40% with the following command: ./t/io_uring -r 5 -X0 -d 1 -s 1 -c 1 -p 0 -S$use_sync -R 0 /mnt/t2/fio/write.0.0 This is repeatable with different filesystems, using raw block devices and using different block devices. Use io_schedule_prepare() / io_schedule_finish() in io_cqring_wait_schedule() to address the difference. After that using io_uring is on par or surpassing synchronous IO (using registered files etc makes it reliably win, but arguably is a less fair comparison). There are other calls to schedule() in io_uring/, but none immediately jump out to be similarly situated, so I did not touch them. Similarly, it's possible that mutex_lock_io() should be used, but it's not clear if there are cases where that matters. Cc: stable@vger.kernel.org # 5.10+ Cc: Pavel Begunkov Cc: io-uring@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Andres Freund Link: https://lore.kernel.org/r/20230707162007.194068-1-andres@anarazel.de [axboe: minor style fixup] Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index e8096d502a7c..7505de2428e0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -2489,6 +2489,8 @@ int io_run_task_work_sig(struct io_ring_ctx *ctx) static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, struct io_wait_queue *iowq) { + int token, ret; + if (unlikely(READ_ONCE(ctx->check_cq))) return 1; if (unlikely(!llist_empty(&ctx->work_llist))) @@ -2499,11 +2501,20 @@ static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, return -EINTR; if (unlikely(io_should_wake(iowq))) return 0; + + /* + * Use io_schedule_prepare/finish, so cpufreq can take into account + * that the task is waiting for IO - turns out to be important for low + * QD IO. + */ + token = io_schedule_prepare(); + ret = 0; if (iowq->timeout == KTIME_MAX) schedule(); else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) - return -ETIME; - return 0; + ret = -ETIME; + io_schedule_finish(token); + return ret; } /* -- cgit v1.2.3 From 8bbe9fee5848371d4af101be445303cac8d880c5 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 17 May 2023 22:30:33 +1000 Subject: powerpc/64s: Fix native_hpte_remove() to be irq-safe Lockdep warns that the use of the hpte_lock in native_hpte_remove() is not safe against an IRQ coming in: ================================ WARNING: inconsistent lock state 6.4.0-rc2-g0c54f4d30ecc #1 Not tainted -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. qemu-system-ppc/93865 [HC0[0]:SC0[0]:HE1:SE1] takes: c0000000021f5180 (hpte_lock){+.?.}-{0:0}, at: native_lock_hpte+0x8/0xd0 {IN-SOFTIRQ-W} state was registered at: lock_acquire+0x134/0x3f0 native_lock_hpte+0x44/0xd0 native_hpte_insert+0xd4/0x2a0 __hash_page_64K+0x218/0x4f0 hash_page_mm+0x464/0x840 do_hash_fault+0x11c/0x260 data_access_common_virt+0x210/0x220 __ip_select_ident+0x140/0x150 ... net_rx_action+0x3bc/0x440 __do_softirq+0x180/0x534 ... sys_sendmmsg+0x34/0x50 system_call_exception+0x128/0x320 system_call_common+0x160/0x2e4 ... Possible unsafe locking scenario: CPU0 ---- lock(hpte_lock); lock(hpte_lock); *** DEADLOCK *** ... Call Trace: dump_stack_lvl+0x98/0xe0 (unreliable) print_usage_bug.part.0+0x250/0x278 mark_lock+0xc9c/0xd30 __lock_acquire+0x440/0x1ca0 lock_acquire+0x134/0x3f0 native_lock_hpte+0x44/0xd0 native_hpte_remove+0xb0/0x190 kvmppc_mmu_map_page+0x650/0x698 [kvm_pr] kvmppc_handle_pagefault+0x534/0x6e8 [kvm_pr] kvmppc_handle_exit_pr+0x6d8/0xe90 [kvm_pr] after_sprg3_load+0x80/0x90 [kvm_pr] kvmppc_vcpu_run_pr+0x108/0x270 [kvm_pr] kvmppc_vcpu_run+0x34/0x48 [kvm] kvm_arch_vcpu_ioctl_run+0x340/0x470 [kvm] kvm_vcpu_ioctl+0x338/0x8b8 [kvm] sys_ioctl+0x7c4/0x13e0 system_call_exception+0x128/0x320 system_call_common+0x160/0x2e4 I suspect kvm_pr is the only caller that doesn't already have IRQs disabled, which is why this hasn't been reported previously. Fix it by disabling IRQs in native_hpte_remove(). Fixes: 35159b5717fa ("powerpc/64s: make HPTE lock and native_tlbie_lock irq-safe") Cc: stable@vger.kernel.org # v6.1+ Signed-off-by: Michael Ellerman Link: https://msgid.link/20230517123033.18430-1-mpe@ellerman.id.au --- arch/powerpc/mm/book3s64/hash_native.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 9342e79870df..430d1d935a7c 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -328,10 +328,12 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, static long native_hpte_remove(unsigned long hpte_group) { + unsigned long hpte_v, flags; struct hash_pte *hptep; int i; int slot_offset; - unsigned long hpte_v; + + local_irq_save(flags); DBG_LOW(" remove(group=%lx)\n", hpte_group); @@ -356,13 +358,16 @@ static long native_hpte_remove(unsigned long hpte_group) slot_offset &= 0x7; } - if (i == HPTES_PER_GROUP) - return -1; + if (i == HPTES_PER_GROUP) { + i = -1; + goto out; + } /* Invalidate the hpte. NOTE: this also unlocks it */ release_hpte_lock(); hptep->v = 0; - +out: + local_irq_restore(flags); return i; } -- cgit v1.2.3 From 5bcedc5931e7bd6928a2d8207078d4cb476b3b55 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Wed, 17 May 2023 17:49:45 +1000 Subject: powerpc/security: Fix Speculation_Store_Bypass reporting on Power10 Nageswara reported that /proc/self/status was showing "vulnerable" for the Speculation_Store_Bypass feature on Power10, eg: $ grep Speculation_Store_Bypass: /proc/self/status Speculation_Store_Bypass: vulnerable But at the same time the sysfs files, and lscpu, were showing "Not affected". This turns out to simply be a bug in the reporting of the Speculation_Store_Bypass, aka. PR_SPEC_STORE_BYPASS, case. When SEC_FTR_STF_BARRIER was added, so that firmware could communicate the vulnerability was not present, the code in ssb_prctl_get() was not updated to check the new flag. So add the check for SEC_FTR_STF_BARRIER being disabled. Rather than adding the new check to the existing if block and expanding the comment to cover both cases, rewrite the three cases to be separate so they can be commented separately for clarity. Fixes: 84ed26fd00c5 ("powerpc/security: Add a security feature for STF barrier") Cc: stable@vger.kernel.org # v5.14+ Reported-by: Nageswara R Sastry Tested-by: Nageswara R Sastry Reviewed-by: Russell Currey Signed-off-by: Michael Ellerman Link: https://msgid.link/20230517074945.53188-1-mpe@ellerman.id.au --- arch/powerpc/kernel/security.c | 37 +++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 206475e3e0b4..4856e1a5161c 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -364,26 +364,27 @@ ssize_t cpu_show_spec_store_bypass(struct device *dev, struct device_attribute * static int ssb_prctl_get(struct task_struct *task) { + /* + * The STF_BARRIER feature is on by default, so if it's off that means + * firmware has explicitly said the CPU is not vulnerable via either + * the hypercall or device tree. + */ + if (!security_ftr_enabled(SEC_FTR_STF_BARRIER)) + return PR_SPEC_NOT_AFFECTED; + + /* + * If the system's CPU has no known barrier (see setup_stf_barrier()) + * then assume that the CPU is not vulnerable. + */ if (stf_enabled_flush_types == STF_BARRIER_NONE) - /* - * We don't have an explicit signal from firmware that we're - * vulnerable or not, we only have certain CPU revisions that - * are known to be vulnerable. - * - * We assume that if we're on another CPU, where the barrier is - * NONE, then we are not vulnerable. - */ return PR_SPEC_NOT_AFFECTED; - else - /* - * If we do have a barrier type then we are vulnerable. The - * barrier is not a global or per-process mitigation, so the - * only value we can report here is PR_SPEC_ENABLE, which - * appears as "vulnerable" in /proc. - */ - return PR_SPEC_ENABLE; - - return -EINVAL; + + /* + * Otherwise the CPU is vulnerable. The barrier is not a global or + * per-process mitigation, so the only value that can be reported here + * is PR_SPEC_ENABLE, which appears as "vulnerable" in /proc. + */ + return PR_SPEC_ENABLE; } int arch_prctl_spec_ctrl_get(struct task_struct *task, unsigned long which) -- cgit v1.2.3 From cf65b12c17b4910d099d78f6ed6919ec040ecdbc Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 22 Jun 2023 21:24:51 +1000 Subject: powerpc/64e: Fix obtool warnings in exceptions-64e.S Since commit aec0ba7472a7 ("powerpc/64: Use -mprofile-kernel for big endian ELFv2 kernels"), this file is checked by objtool. Fix warnings such as: arch/powerpc/kernel/idle_64e.o: warning: objtool: .text+0x20: unannotated intra-function call arch/powerpc/kernel/exceptions-64e.o: warning: objtool: .text+0x218: unannotated intra-function call Signed-off-by: Michael Ellerman Link: https://msgid.link/20230622112451.735268-1-mpe@ellerman.id.au --- arch/powerpc/kernel/exceptions-64e.S | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 3f86091e68b3..7ab4c8c0f1ab 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -5,6 +5,7 @@ * Copyright (C) 2007 Ben. Herrenschmidt (benh@kernel.crashing.org), IBM Corp. */ +#include #include #include #include @@ -66,7 +67,7 @@ #define SPECIAL_EXC_LOAD(reg, name) \ ld reg, (SPECIAL_EXC_##name * 8 + SPECIAL_EXC_FRAME_OFFS)(r1) -special_reg_save: +SYM_CODE_START_LOCAL(special_reg_save) /* * We only need (or have stack space) to save this stuff if * we interrupted the kernel. @@ -131,8 +132,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) SPECIAL_EXC_STORE(r10,CSRR1) blr +SYM_CODE_END(special_reg_save) -ret_from_level_except: +SYM_CODE_START_LOCAL(ret_from_level_except) ld r3,_MSR(r1) andi. r3,r3,MSR_PR beq 1f @@ -206,6 +208,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mtxer r11 blr +SYM_CODE_END(ret_from_level_except) .macro ret_from_level srr0 srr1 paca_ex scratch bl ret_from_level_except @@ -232,13 +235,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_EMB_HV) mfspr r13,\scratch .endm -ret_from_crit_except: +SYM_CODE_START_LOCAL(ret_from_crit_except) ret_from_level SPRN_CSRR0 SPRN_CSRR1 PACA_EXCRIT SPRN_SPRG_CRIT_SCRATCH rfci +SYM_CODE_END(ret_from_crit_except) -ret_from_mc_except: +SYM_CODE_START_LOCAL(ret_from_mc_except) ret_from_level SPRN_MCSRR0 SPRN_MCSRR1 PACA_EXMC SPRN_SPRG_MC_SCRATCH rfmci +SYM_CODE_END(ret_from_mc_except) /* Exception prolog code for all exceptions */ #define EXCEPTION_PROLOG(n, intnum, type, addition) \ @@ -978,20 +983,22 @@ masked_interrupt_book3e_0x2c0: * r14 and r15 containing the fault address and error code, with the * original values stashed away in the PACA */ -storage_fault_common: +SYM_CODE_START_LOCAL(storage_fault_common) addi r3,r1,STACK_INT_FRAME_REGS bl do_page_fault b interrupt_return +SYM_CODE_END(storage_fault_common) /* * Alignment exception doesn't fit entirely in the 0x100 bytes so it * continues here. */ -alignment_more: +SYM_CODE_START_LOCAL(alignment_more) addi r3,r1,STACK_INT_FRAME_REGS bl alignment_exception REST_NVGPRS(r1) b interrupt_return +SYM_CODE_END(alignment_more) /* * Trampolines used when spotting a bad kernel stack pointer in @@ -1030,8 +1037,7 @@ BAD_STACK_TRAMPOLINE(0xe00) BAD_STACK_TRAMPOLINE(0xf00) BAD_STACK_TRAMPOLINE(0xf20) - .globl bad_stack_book3e -bad_stack_book3e: +_GLOBAL(bad_stack_book3e) /* XXX: Needs to make SPRN_SPRG_GEN depend on exception type */ mfspr r10,SPRN_SRR0; /* read SRR0 before touching stack */ ld r1,PACAEMERGSP(r13) @@ -1285,8 +1291,7 @@ have_hes: * ever takes any parameters, the SCOM code must also be updated to * provide them. */ - .globl a2_tlbinit_code_start -a2_tlbinit_code_start: +_GLOBAL(a2_tlbinit_code_start) ori r11,r3,MAS0_WQ_ALLWAYS oris r11,r11,MAS0_ESEL(3)@h /* Use way 3: workaround A2 erratum 376 */ @@ -1479,8 +1484,7 @@ _GLOBAL(book3e_secondary_thread_init) mflr r28 b 3b - .globl init_core_book3e -init_core_book3e: +_GLOBAL(init_core_book3e) /* Establish the interrupt vector base */ tovirt(r2,r2) LOAD_REG_ADDR(r3, interrupt_base_book3e) @@ -1488,7 +1492,7 @@ init_core_book3e: sync blr -init_thread_book3e: +SYM_CODE_START_LOCAL(init_thread_book3e) lis r3,(SPRN_EPCR_ICM | SPRN_EPCR_GICM)@h mtspr SPRN_EPCR,r3 @@ -1502,6 +1506,7 @@ init_thread_book3e: mtspr SPRN_TSR,r3 blr +SYM_CODE_END(init_thread_book3e) _GLOBAL(__setup_base_ivors) SET_IVOR(0, 0x020) /* Critical Input */ -- cgit v1.2.3 From cf53564b11cef5cdfafc548b172345c9aa753f89 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Jul 2023 07:54:05 +0530 Subject: powerpc/mm/book3s64/hash/4k: Add pmd_same callback for 4K page size With commit 0d940a9b270b ("mm/pgtable: allow pte_offset_map[_lock]() to fail") the kernel is now using pmd_same to compare pmd values that are pointing to a level 4 page table page. Move the functions out of #ifdef CONFIG_TRANSPARENT_HUGEPAGE and add a variant that can work with both 4K and 64K page size. kernel BUG at arch/powerpc/include/asm/book3s/64/hash-4k.h:141! Oops: Exception in kernel mode, sig: 5 [#1] LE PAGE_SIZE=4K MMU=Hash SMP NR_CPUS=2048 NUMA pSeries ..... NIP [c00000000048aee0] __pte_offset_map_lock+0xf0/0x164 LR [c00000000048ae78] __pte_offset_map_lock+0x88/0x164 Call Trace: 0xc0003f000009a340 (unreliable) __handle_mm_fault+0x1340/0x1980 handle_mm_fault+0xbc/0x380 __get_user_pages+0x320/0x550 get_user_pages_remote+0x13c/0x520 get_arg_page+0x80/0x1d0 copy_string_kernel+0xc8/0x250 kernel_execve+0x11c/0x270 run_init_process+0xe4/0x10c kernel_init+0xbc/0x1a0 ret_from_kernel_user_thread+0x14/0x1c Reported-by: Michael Ellerman Signed-off-by: "Aneesh Kumar K.V" Acked-by: Hugh Dickins Signed-off-by: Michael Ellerman Link: https://msgid.link/20230706022405.798157-1-aneesh.kumar@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 6 ------ arch/powerpc/include/asm/book3s/64/hash-64k.h | 5 ----- arch/powerpc/include/asm/book3s/64/hash.h | 5 +++++ 3 files changed, 5 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index b6ac4f86c87b..6472b08fa1b0 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -136,12 +136,6 @@ static inline int hash__pmd_trans_huge(pmd_t pmd) return 0; } -static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - BUG(); - return 0; -} - static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) { BUG(); diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 338e62fbea0b..0bf6fd0bf42a 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -263,11 +263,6 @@ static inline int hash__pmd_trans_huge(pmd_t pmd) (_PAGE_PTE | H_PAGE_THP_HUGE)); } -static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) -{ - return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); -} - static inline pmd_t hash__pmd_mkhuge(pmd_t pmd) { return __pmd(pmd_val(pmd) | (_PAGE_PTE | H_PAGE_THP_HUGE)); diff --git a/arch/powerpc/include/asm/book3s/64/hash.h b/arch/powerpc/include/asm/book3s/64/hash.h index 17e7a778c856..d4a19e6547ac 100644 --- a/arch/powerpc/include/asm/book3s/64/hash.h +++ b/arch/powerpc/include/asm/book3s/64/hash.h @@ -132,6 +132,11 @@ static inline int get_region_id(unsigned long ea) return region_id; } +static inline int hash__pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return (((pmd_raw(pmd_a) ^ pmd_raw(pmd_b)) & ~cpu_to_be64(_PAGE_HPTEFLAGS)) == 0); +} + #define hash__pmd_bad(pmd) (pmd_val(pmd) & H_PMD_BAD_BITS) #define hash__pud_bad(pud) (pud_val(pud) & H_PUD_BAD_BITS) static inline int hash__p4d_bad(p4d_t p4d) -- cgit v1.2.3 From 27c68c216ee1f1b086e789a64486e6511e380b8a Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 4 Jul 2023 11:15:15 -0700 Subject: perf/x86: Fix lockdep warning in for_each_sibling_event() on SPR On SPR, the load latency event needs an auxiliary event in the same group to work properly. There's a check in intel_pmu_hw_config() for this to iterate sibling events and find a mem-loads-aux event. The for_each_sibling_event() has a lockdep assert to make sure if it disabled hardirq or hold leader->ctx->mutex. This works well if the given event has a separate leader event since perf_try_init_event() grabs the leader->ctx->mutex to protect the sibling list. But it can cause a problem when the event itself is a leader since the event is not initialized yet and there's no ctx for the event. Actually I got a lockdep warning when I run the below command on SPR, but I guess it could be a NULL pointer dereference. $ perf record -d -e cpu/mem-loads/uP true The code path to the warning is: sys_perf_event_open() perf_event_alloc() perf_init_event() perf_try_init_event() x86_pmu_event_init() hsw_hw_config() intel_pmu_hw_config() for_each_sibling_event() lockdep_assert_event_ctx() We don't need for_each_sibling_event() when it's a standalone event. Let's return the error code directly. Fixes: f3c0eba28704 ("perf: Add a few assertions") Reported-by: Greg Thelen Signed-off-by: Namhyung Kim Signed-off-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20230704181516.3293665-1-namhyung@kernel.org --- arch/x86/events/intel/core.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index a149fafad813..2a284ba951b7 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3993,6 +3993,13 @@ static int intel_pmu_hw_config(struct perf_event *event) struct perf_event *leader = event->group_leader; struct perf_event *sibling = NULL; + /* + * When this memload event is also the first event (no group + * exists yet), then there is no aux event before it. + */ + if (leader == event) + return -ENODATA; + if (!is_mem_loads_aux_event(leader)) { for_each_sibling_event(sibling, leader) { if (is_mem_loads_aux_event(sibling)) -- cgit v1.2.3 From 0479a42d4c15bd554f54d89d12bf68218e3e70da Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jun 2023 16:27:13 +0200 Subject: x86/cfi: Extend {JMP,CAKK}_NOSPEC comment With the introduction of kCFI these helpers are no longer equivalent to C indirect calls and should be used with care. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20230622144321.360957723%40infradead.org --- arch/x86/include/asm/nospec-branch.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 55388c9f7601..1a65cf4acb2b 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -234,6 +234,10 @@ * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple * indirect jmp/call which may be susceptible to the Spectre variant 2 * attack. + * + * NOTE: these do not take kCFI into account and are thus not comparable to C + * indirect calls, take care when using. The target of these should be an ENDBR + * instruction irrespective of kCFI. */ .macro JMP_NOSPEC reg:req #ifdef CONFIG_RETPOLINE -- cgit v1.2.3 From be0fffa5ca894a971a31c5e28aa77b633a97d1dc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 22 Jun 2023 15:36:50 +0200 Subject: x86/alternative: Rename apply_ibt_endbr() The current name doesn't reflect what it does very well. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20230622144321.427441595%40infradead.org --- arch/um/kernel/um_arch.c | 2 +- arch/x86/include/asm/alternative.h | 2 +- arch/x86/include/asm/ibt.h | 2 +- arch/x86/kernel/alternative.c | 9 ++++++--- arch/x86/kernel/module.c | 2 +- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 918fed7ad4d8..b1bfed0c8528 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -437,7 +437,7 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_ibt_endbr(s32 *start, s32 *end) +void apply_seal_endbr(s32 *start, s32 *end) { } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 6c15a622ad60..9c4da699e11a 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -96,7 +96,7 @@ extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); extern void apply_retpolines(s32 *start, s32 *end); extern void apply_returns(s32 *start, s32 *end); -extern void apply_ibt_endbr(s32 *start, s32 *end); +extern void apply_seal_endbr(s32 *start, s32 *end); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, s32 *start_cfi, s32 *end_cfi); diff --git a/arch/x86/include/asm/ibt.h b/arch/x86/include/asm/ibt.h index baae6b4fea23..1e59581d500c 100644 --- a/arch/x86/include/asm/ibt.h +++ b/arch/x86/include/asm/ibt.h @@ -34,7 +34,7 @@ /* * Create a dummy function pointer reference to prevent objtool from marking * the function as needing to be "sealed" (i.e. ENDBR converted to NOP by - * apply_ibt_endbr()). + * apply_seal_endbr()). */ #define IBT_NOSEAL(fname) \ ".pushsection .discard.ibt_endbr_noseal\n\t" \ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 72646d75b6ff..27e0cb4d062a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -803,7 +803,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn) /* * Generated by: objtool --ibt */ -void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { s32 *s; @@ -818,7 +818,7 @@ void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end) #else -void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1565,7 +1565,10 @@ void __init alternative_instructions(void) */ callthunks_patch_builtin_calls(); - apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + /* + * Seal all functions that do not have their address taken. + */ + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index b05f62ee2344..5f71a0cf4399 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -358,7 +358,7 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_ibt_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); } if (locks) { void *lseg = (void *)locks->sh_addr; -- cgit v1.2.3 From 9831c6253ace48051189f6d18a15f658f94babc2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Jun 2023 22:17:12 +0200 Subject: x86/cfi: Extend ENDBR sealing to kCFI Kees noted that IBT sealing could be extended to kCFI. Fundamentally it is the list of functions that do not have their address taken and are thus never called indirectly. It doesn't matter that objtool uses IBT infrastructure to determine this list, once we have it it can also be used to clobber kCFI hashes and avoid kCFI indirect calls. Suggested-by: Kees Cook Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20230622144321.494426891%40infradead.org --- arch/x86/kernel/alternative.c | 44 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 27e0cb4d062a..04b25a275233 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -778,6 +778,8 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } #ifdef CONFIG_X86_KERNEL_IBT +static void poison_cfi(void *addr); + static void __init_or_module poison_endbr(void *addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); @@ -802,6 +804,9 @@ static void __init_or_module poison_endbr(void *addr, bool warn) /* * Generated by: objtool --ibt + * + * Seal the functions for indirect calls by clobbering the ENDBR instructions + * and the kCFI hash value. */ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) { @@ -812,7 +817,7 @@ void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) poison_endbr(addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_endbr(addr - 16, false); + poison_cfi(addr - 16); } } @@ -1177,6 +1182,41 @@ err: pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n"); } +static inline void poison_hash(void *addr) +{ + *(u32 *)addr = 0; +} + +static void poison_cfi(void *addr) +{ + switch (cfi_mode) { + case CFI_FINEIBT: + /* + * __cfi_\func: + * osp nopl (%rax) + * subl $0, %r10d + * jz 1f + * ud2 + * 1: nop + */ + poison_endbr(addr, false); + poison_hash(addr + fineibt_preamble_hash); + break; + + case CFI_KCFI: + /* + * __cfi_\func: + * movl $0, %eax + * .skip 11, 0x90 + */ + poison_hash(addr + 1); + break; + + default: + break; + } +} + #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, @@ -1184,6 +1224,8 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, { } +static void poison_cfi(void *addr) { } + #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, -- cgit v1.2.3 From 81f755d561f365f544795fad92f05a085ea4f292 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 23 Jun 2023 18:55:28 -0400 Subject: x86/32: Remove schedule_tail_wrapper() The unwinder expects a return address at the very top of the kernel stack just below pt_regs and before any stack frame is created. Instead of calling a wrapper, set up a return address as if ret_from_fork() was called from the syscall entry code. Signed-off-by: Brian Gerst Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20230623225529.34590-2-brgerst@gmail.com --- arch/x86/entry/entry_32.S | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 91397f58ac30..e56123f03a79 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -719,26 +719,6 @@ SYM_CODE_START(__switch_to_asm) SYM_CODE_END(__switch_to_asm) .popsection -/* - * The unwinder expects the last frame on the stack to always be at the same - * offset from the end of the page, which allows it to validate the stack. - * Calling schedule_tail() directly would break that convention because its an - * asmlinkage function so its argument has to be pushed on the stack. This - * wrapper creates a proper "end of stack" frame header before the call. - */ -.pushsection .text, "ax" -SYM_FUNC_START(schedule_tail_wrapper) - FRAME_BEGIN - - pushl %eax - call schedule_tail - popl %eax - - FRAME_END - RET -SYM_FUNC_END(schedule_tail_wrapper) -.popsection - /* * A newly forked process directly context switches into this address. * @@ -748,16 +728,23 @@ SYM_FUNC_END(schedule_tail_wrapper) */ .pushsection .text, "ax" SYM_CODE_START(ret_from_fork) - call schedule_tail_wrapper + /* return address for the stack unwinder */ + pushl $.Lsyscall_32_done + + FRAME_BEGIN + pushl %eax + call schedule_tail + addl $4, %esp + FRAME_END testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - movl %esp, %eax + leal 4(%esp), %eax call syscall_exit_to_user_mode - jmp .Lsyscall_32_done + RET /* kernel thread */ 1: movl %edi, %eax -- cgit v1.2.3 From 3aec4ecb3d1f313a8ab985df7cab07c4af81f478 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Fri, 23 Jun 2023 18:55:29 -0400 Subject: x86: Rewrite ret_from_fork() in C When kCFI is enabled, special handling is needed for the indirect call to the kernel thread function. Rewrite the ret_from_fork() function in C so that the compiler can properly handle the indirect call. Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Brian Gerst Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lkml.kernel.org/r/20230623225529.34590-3-brgerst@gmail.com --- arch/x86/entry/entry_32.S | 30 ++++++++---------------------- arch/x86/entry/entry_64.S | 33 ++++++++------------------------- arch/x86/include/asm/switch_to.h | 4 +++- arch/x86/kernel/process.c | 22 +++++++++++++++++++++- 4 files changed, 40 insertions(+), 49 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index e56123f03a79..6e6af42e044a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -727,36 +727,22 @@ SYM_CODE_END(__switch_to_asm) * edi: kernel thread arg */ .pushsection .text, "ax" -SYM_CODE_START(ret_from_fork) +SYM_CODE_START(ret_from_fork_asm) + movl %esp, %edx /* regs */ + /* return address for the stack unwinder */ pushl $.Lsyscall_32_done FRAME_BEGIN - pushl %eax - call schedule_tail + /* prev already in EAX */ + movl %ebx, %ecx /* fn */ + pushl %edi /* fn_arg */ + call ret_from_fork addl $4, %esp FRAME_END - testl %ebx, %ebx - jnz 1f /* kernel threads are uncommon */ - -2: - /* When we fork, we trace the syscall return in the child, too. */ - leal 4(%esp), %eax - call syscall_exit_to_user_mode RET - - /* kernel thread */ -1: movl %edi, %eax - CALL_NOSPEC ebx - /* - * A kernel thread is allowed to return here after successfully - * calling kernel_execve(). Exit to userspace to complete the execve() - * syscall. - */ - movl $0, PT_EAX(%esp) - jmp 2b -SYM_CODE_END(ret_from_fork) +SYM_CODE_END(ret_from_fork_asm) .popsection SYM_ENTRY(__begin_SYSENTER_singlestep_region, SYM_L_GLOBAL, SYM_A_NONE) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f31e286c2977..91f6818884fa 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -284,36 +284,19 @@ SYM_FUNC_END(__switch_to_asm) * r12: kernel thread arg */ .pushsection .text, "ax" - __FUNC_ALIGN -SYM_CODE_START_NOALIGN(ret_from_fork) - UNWIND_HINT_END_OF_STACK +SYM_CODE_START(ret_from_fork_asm) + UNWIND_HINT_REGS ANNOTATE_NOENDBR // copy_thread CALL_DEPTH_ACCOUNT - movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + movq %rax, %rdi /* prev */ + movq %rsp, %rsi /* regs */ + movq %rbx, %rdx /* fn */ + movq %r12, %rcx /* fn_arg */ + call ret_from_fork -2: - UNWIND_HINT_REGS - movq %rsp, %rdi - call syscall_exit_to_user_mode /* returns with IRQs disabled */ jmp swapgs_restore_regs_and_return_to_usermode - -1: - /* kernel thread */ - UNWIND_HINT_END_OF_STACK - movq %r12, %rdi - CALL_NOSPEC rbx - /* - * A kernel thread is allowed to return here after successfully - * calling kernel_execve(). Exit to userspace to complete the execve() - * syscall. - */ - movq $0, RAX(%rsp) - jmp 2b -SYM_CODE_END(ret_from_fork) +SYM_CODE_END(ret_from_fork_asm) .popsection .macro DEBUG_ENTRY_ASSERT_IRQS_OFF diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 5c91305d09d2..f42dbf17f52b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -12,7 +12,9 @@ struct task_struct *__switch_to_asm(struct task_struct *prev, __visible struct task_struct *__switch_to(struct task_struct *prev, struct task_struct *next); -asmlinkage void ret_from_fork(void); +asmlinkage void ret_from_fork_asm(void); +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg); /* * This is the structure pointed to by thread.sp for an inactive task. The diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ff9b80a0e3e3..72015dba72ab 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -134,6 +135,25 @@ static int set_new_tls(struct task_struct *p, unsigned long tls) return do_set_thread_area_64(p, ARCH_SET_FS, tls); } +__visible void ret_from_fork(struct task_struct *prev, struct pt_regs *regs, + int (*fn)(void *), void *fn_arg) +{ + schedule_tail(prev); + + /* Is this a kernel thread? */ + if (unlikely(fn)) { + fn(fn_arg); + /* + * A kernel thread is allowed to return here after successfully + * calling kernel_execve(). Exit to userspace to complete the + * execve() syscall. + */ + regs->ax = 0; + } + + syscall_exit_to_user_mode(regs); +} + int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) { unsigned long clone_flags = args->flags; @@ -149,7 +169,7 @@ int copy_thread(struct task_struct *p, const struct kernel_clone_args *args) frame = &fork_frame->frame; frame->bp = encode_frame_pointer(childregs); - frame->ret_addr = (unsigned long) ret_from_fork; + frame->ret_addr = (unsigned long) ret_from_fork_asm; p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap = NULL; p->thread.iopl_warn = 0; -- cgit v1.2.3 From 04505bbbbb15da950ea0239e328a76a3ad2376e0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 15 Jun 2023 21:35:48 +0200 Subject: x86/fineibt: Poison ENDBR at +0 Alyssa noticed that when building the kernel with CFI_CLANG+IBT and booting on IBT enabled hardware to obtain FineIBT, the indirect functions look like: __cfi_foo: endbr64 subl $hash, %r10d jz 1f ud2 nop 1: foo: endbr64 This is because the compiler generates code for kCFI+IBT. In that case the caller does the hash check and will jump to +0, so there must be an ENDBR there. The compiler doesn't know about FineIBT at all; also it is possible to actually use kCFI+IBT when booting with 'cfi=kcfi' on IBT enabled hardware. Having this second ENDBR however makes it possible to elide the CFI check. Therefore, we should poison this second ENDBR when switching to FineIBT mode. Fixes: 931ab63664f0 ("x86/ibt: Implement FineIBT") Reported-by: "Milburn, Alyssa" Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Kees Cook Reviewed-by: Sami Tolvanen Link: https://lore.kernel.org/r/20230615193722.194131053@infradead.org --- arch/x86/kernel/alternative.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 04b25a275233..d77aaabf9fe8 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1068,6 +1068,17 @@ static int cfi_rewrite_preamble(s32 *start, s32 *end) return 0; } +static void cfi_rewrite_endbr(s32 *start, s32 *end) +{ + s32 *s; + + for (s = start; s < end; s++) { + void *addr = (void *)s + *s; + + poison_endbr(addr+16, false); + } +} + /* .retpoline_sites */ static int cfi_rand_callers(s32 *start, s32 *end) { @@ -1162,14 +1173,19 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_FINEIBT: + /* place the FineIBT preamble at func()-16 */ ret = cfi_rewrite_preamble(start_cfi, end_cfi); if (ret) goto err; + /* rewrite the callers to target func()-16 */ ret = cfi_rewrite_callers(start_retpoline, end_retpoline); if (ret) goto err; + /* now that nobody targets func()+0, remove ENDBR there */ + cfi_rewrite_endbr(start_cfi, end_cfi); + if (builtin) pr_info("Using FineIBT CFI\n"); return; -- cgit v1.2.3 From 9f71fbcde2820f2af4658313e808cf1e579190a4 Mon Sep 17 00:00:00 2001 From: Michal Kubecek Date: Thu, 29 Jun 2023 12:05:05 +0200 Subject: objtool: initialize all of struct elf Function elf_open_read() only zero initializes the initial part of allocated struct elf; num_relocs member was recently added outside the zeroed part so that it was left uninitialized, resulting in build failures on some systems. The partial initialization is a relic of times when struct elf had large hash tables embedded. This is no longer the case so remove the trap and initialize the whole structure instead. Fixes: eb0481bbc4ce ("objtool: Fix reloc_hash size") Signed-off-by: Michal Kubecek Signed-off-by: Peter Zijlstra (Intel) Acked-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20230629102051.42E8360467@lion.mk-sys.cz --- tools/objtool/elf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/objtool/elf.c b/tools/objtool/elf.c index d420b5d2e2b6..081befa4674b 100644 --- a/tools/objtool/elf.c +++ b/tools/objtool/elf.c @@ -1005,7 +1005,7 @@ struct elf *elf_open_read(const char *name, int flags) perror("malloc"); return NULL; } - memset(elf, 0, offsetof(struct elf, sections)); + memset(elf, 0, sizeof(*elf)); INIT_LIST_HEAD(&elf->sections); -- cgit v1.2.3 From 719a937b7003933de1298ffa4b881dd6a234e244 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 16 Jun 2023 14:43:55 +0200 Subject: iov_iter: Mark copy_iovec_from_user() noclone Extend commit 50f9a76ef127 ("iov_iter: Mark copy_compat_iovec_from_user() noinline") to also cover copy_iovec_from_user(). Different compiler versions cause the same problem on different functions. lib/iov_iter.o: warning: objtool: .altinstr_replacement+0x1f: redundant UACCESS disable lib/iov_iter.o: warning: objtool: iovec_from_user+0x84: call to copy_iovec_from_user.part.0() with UACCESS enabled lib/iov_iter.o: warning: objtool: __import_iovec+0x143: call to copy_iovec_from_user.part.0() with UACCESS enabled Fixes: 50f9a76ef127 ("iov_iter: Mark copy_compat_iovec_from_user() noinline") Signed-off-by: Peter Zijlstra (Intel) Tested-by: Borislav Petkov (AMD) Link: https://lkml.kernel.org/r/20230616124354.GD4253@hirez.programming.kicks-ass.net --- lib/iov_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index b667b1e2f688..e4dc809d1075 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -1349,7 +1349,7 @@ uaccess_end: return ret; } -static int copy_iovec_from_user(struct iovec *iov, +static __noclone int copy_iovec_from_user(struct iovec *iov, const struct iovec __user *uiov, unsigned long nr_segs) { int ret = -EFAULT; -- cgit v1.2.3 From ae2ad293d6be143ad223f5f947cca07bcbe42595 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 20 Jun 2023 16:07:47 +0800 Subject: sched/fair: Use recent_used_cpu to test p->cpus_ptr When checking whether a recently used CPU can be a potential idle candidate, recent_used_cpu should be used to test p->cpus_ptr as p->recent_used_cpu is not equal to recent_used_cpu and candidate decision is made based on recent_used_cpu here. Fixes: 89aafd67f28c ("sched/fair: Use prev instead of new target as recent_used_cpu") Signed-off-by: Miaohe Lin Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Phil Auld Acked-by: Mel Gorman Link: https://lore.kernel.org/r/20230620080747.359122-1-linmiaohe@huawei.com --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index a80a73909dc2..b3e25be58e2b 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7174,7 +7174,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && - cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) && + cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { return recent_used_cpu; } -- cgit v1.2.3 From aff037078ecaecf34a7c2afab1341815f90fba5e Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 29 Jun 2023 17:56:12 -0700 Subject: sched/psi: use kernfs polling functions for PSI trigger polling Destroying psi trigger in cgroup_file_release causes UAF issues when a cgroup is removed from under a polling process. This is happening because cgroup removal causes a call to cgroup_file_release while the actual file is still alive. Destroying the trigger at this point would also destroy its waitqueue head and if there is still a polling process on that file accessing the waitqueue, it will step on the freed pointer: do_select vfs_poll do_rmdir cgroup_rmdir kernfs_drain_open_files cgroup_file_release cgroup_pressure_release psi_trigger_destroy wake_up_pollfree(&t->event_wait) // vfs_poll is unblocked synchronize_rcu kfree(t) poll_freewait -> UAF access to the trigger's waitqueue head Patch [1] fixed this issue for epoll() case using wake_up_pollfree(), however the same issue exists for synchronous poll() case. The root cause of this issue is that the lifecycles of the psi trigger's waitqueue and of the file associated with the trigger are different. Fix this by using kernfs_generic_poll function when polling on cgroup-specific psi triggers. It internally uses kernfs_open_node->poll waitqueue head with its lifecycle tied to the file's lifecycle. This also renders the fix in [1] obsolete, so revert it. [1] commit c2dbe32d5db5 ("sched/psi: Fix use-after-free in ep_remove_wait_queue()") Fixes: 0e94682b73bf ("psi: introduce psi monitor") Closes: https://lore.kernel.org/all/20230613062306.101831-1-lujialin4@huawei.com/ Reported-by: Lu Jialin Signed-off-by: Suren Baghdasaryan Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20230630005612.1014540-1-surenb@google.com --- include/linux/psi.h | 5 +++-- include/linux/psi_types.h | 3 +++ kernel/cgroup/cgroup.c | 2 +- kernel/sched/psi.c | 29 +++++++++++++++++++++-------- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/include/linux/psi.h b/include/linux/psi.h index ab26200c2803..e0745873e3f2 100644 --- a/include/linux/psi.h +++ b/include/linux/psi.h @@ -23,8 +23,9 @@ void psi_memstall_enter(unsigned long *flags); void psi_memstall_leave(unsigned long *flags); int psi_show(struct seq_file *s, struct psi_group *group, enum psi_res res); -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res, struct file *file); +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of); void psi_trigger_destroy(struct psi_trigger *t); __poll_t psi_trigger_poll(void **trigger_ptr, struct file *file, diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h index 040c089581c6..f1fd3a8044e0 100644 --- a/include/linux/psi_types.h +++ b/include/linux/psi_types.h @@ -137,6 +137,9 @@ struct psi_trigger { /* Wait queue for polling */ wait_queue_head_t event_wait; + /* Kernfs file for cgroup triggers */ + struct kernfs_open_file *of; + /* Pending event flag */ int event; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index bfe3cd8ccf36..f55a40db065f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3730,7 +3730,7 @@ static ssize_t pressure_write(struct kernfs_open_file *of, char *buf, } psi = cgroup_psi(cgrp); - new = psi_trigger_create(psi, buf, res, of->file); + new = psi_trigger_create(psi, buf, res, of->file, of); if (IS_ERR(new)) { cgroup_put(cgrp); return PTR_ERR(new); diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 81fca77397f6..9bb3f2b3ccfc 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -493,8 +493,12 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, continue; /* Generate an event */ - if (cmpxchg(&t->event, 0, 1) == 0) - wake_up_interruptible(&t->event_wait); + if (cmpxchg(&t->event, 0, 1) == 0) { + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); + } t->last_event_time = now; /* Reset threshold breach flag once event got generated */ t->pending_event = false; @@ -1271,8 +1275,9 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) return 0; } -struct psi_trigger *psi_trigger_create(struct psi_group *group, - char *buf, enum psi_res res, struct file *file) +struct psi_trigger *psi_trigger_create(struct psi_group *group, char *buf, + enum psi_res res, struct file *file, + struct kernfs_open_file *of) { struct psi_trigger *t; enum psi_states state; @@ -1331,7 +1336,9 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, t->event = 0; t->last_event_time = 0; - init_waitqueue_head(&t->event_wait); + t->of = of; + if (!of) + init_waitqueue_head(&t->event_wait); t->pending_event = false; t->aggregator = privileged ? PSI_POLL : PSI_AVGS; @@ -1388,7 +1395,10 @@ void psi_trigger_destroy(struct psi_trigger *t) * being accessed later. Can happen if cgroup is deleted from under a * polling process. */ - wake_up_pollfree(&t->event_wait); + if (t->of) + kernfs_notify(t->of->kn); + else + wake_up_interruptible(&t->event_wait); if (t->aggregator == PSI_AVGS) { mutex_lock(&group->avgs_lock); @@ -1465,7 +1475,10 @@ __poll_t psi_trigger_poll(void **trigger_ptr, if (!t) return DEFAULT_POLLMASK | EPOLLERR | EPOLLPRI; - poll_wait(file, &t->event_wait, wait); + if (t->of) + kernfs_generic_poll(t->of, wait); + else + poll_wait(file, &t->event_wait, wait); if (cmpxchg(&t->event, 1, 0) == 1) ret |= EPOLLPRI; @@ -1535,7 +1548,7 @@ static ssize_t psi_write(struct file *file, const char __user *user_buf, return -EBUSY; } - new = psi_trigger_create(&psi_system, buf, res, file); + new = psi_trigger_create(&psi_system, buf, res, file, NULL); if (IS_ERR(new)) { mutex_unlock(&seq->lock); return PTR_ERR(new); -- cgit v1.2.3 From f46a0b47cc0829acd050213194c5a77351e619b2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 3 Jul 2023 17:07:06 +0200 Subject: pinctrl: renesas: rzv2m: Handle non-unique subnode names The eMMC and SDHI pin control configuration nodes in DT have subnodes with the same names ("data" and "ctrl"). As the RZ/V2M pin control driver considers only the names of the subnodes, this leads to conflicts: pinctrl-rzv2m b6250000.pinctrl: pin P8_2 already requested by 85000000.mmc; cannot claim for 85020000.mmc pinctrl-rzv2m b6250000.pinctrl: pin-130 (85020000.mmc) status -22 renesas_sdhi_internal_dmac 85020000.mmc: Error applying setting, reverse things back Fix this by constructing unique names from the node names of both the pin control configuration node and its child node, where appropriate. Reported by: Fabrizio Castro Fixes: 92a9b825257614af ("pinctrl: renesas: Add RZ/V2M pin and gpio controller driver") Signed-off-by: Geert Uytterhoeven Tested-by: Fabrizio Castro Link: https://lore.kernel.org/r/607bd6ab4905b0b1b119a06ef953fa1184505777.1688396717.git.geert+renesas@glider.be --- drivers/pinctrl/renesas/pinctrl-rzv2m.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzv2m.c b/drivers/pinctrl/renesas/pinctrl-rzv2m.c index e5472293bc7f..35b23c1a5684 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzv2m.c +++ b/drivers/pinctrl/renesas/pinctrl-rzv2m.c @@ -209,6 +209,7 @@ static int rzv2m_map_add_config(struct pinctrl_map *map, static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct device_node *np, + struct device_node *parent, struct pinctrl_map **map, unsigned int *num_maps, unsigned int *index) @@ -226,6 +227,7 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct property *prop; int ret, gsel, fsel; const char **pin_fn; + const char *name; const char *pin; pinmux = of_find_property(np, "pinmux", NULL); @@ -309,8 +311,19 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev, psel_val[i] = MUX_FUNC(value); } + if (parent) { + name = devm_kasprintf(pctrl->dev, GFP_KERNEL, "%pOFn.%pOFn", + parent, np); + if (!name) { + ret = -ENOMEM; + goto done; + } + } else { + name = np->name; + } + /* Register a single pin group listing all the pins we read from DT */ - gsel = pinctrl_generic_add_group(pctldev, np->name, pins, num_pinmux, NULL); + gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL); if (gsel < 0) { ret = gsel; goto done; @@ -320,17 +333,16 @@ static int rzv2m_dt_subnode_to_map(struct pinctrl_dev *pctldev, * Register a single group function where the 'data' is an array PSEL * register values read from DT. */ - pin_fn[0] = np->name; - fsel = pinmux_generic_add_function(pctldev, np->name, pin_fn, 1, - psel_val); + pin_fn[0] = name; + fsel = pinmux_generic_add_function(pctldev, name, pin_fn, 1, psel_val); if (fsel < 0) { ret = fsel; goto remove_group; } maps[idx].type = PIN_MAP_TYPE_MUX_GROUP; - maps[idx].data.mux.group = np->name; - maps[idx].data.mux.function = np->name; + maps[idx].data.mux.group = name; + maps[idx].data.mux.function = name; idx++; dev_dbg(pctrl->dev, "Parsed %pOF with %d pins\n", np, num_pinmux); @@ -377,7 +389,7 @@ static int rzv2m_dt_node_to_map(struct pinctrl_dev *pctldev, index = 0; for_each_child_of_node(np, child) { - ret = rzv2m_dt_subnode_to_map(pctldev, child, map, + ret = rzv2m_dt_subnode_to_map(pctldev, child, np, map, num_maps, &index); if (ret < 0) { of_node_put(child); @@ -386,7 +398,7 @@ static int rzv2m_dt_node_to_map(struct pinctrl_dev *pctldev, } if (*num_maps == 0) { - ret = rzv2m_dt_subnode_to_map(pctldev, np, map, + ret = rzv2m_dt_subnode_to_map(pctldev, np, NULL, map, num_maps, &index); if (ret < 0) goto done; -- cgit v1.2.3 From bfc374a145ae133613e05b9b89be561f169cb58d Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 4 Jul 2023 12:18:58 +0100 Subject: pinctrl: renesas: rzg2l: Handle non-unique subnode names Currently, sd1 and sd0 have unique subnode names 'sd1_mux' and 'sd0_mux'. If we change these to non-unique subnode names such as 'mux' this can lead to the below conflict as the RZ/G2L pin control driver considers only the names of the subnodes. pinctrl-rzg2l 11030000.pinctrl: pin P47_0 already requested by 11c00000.mmc; cannot claim for 11c10000.mmc pinctrl-rzg2l 11030000.pinctrl: pin-376 (11c10000.mmc) status -22 pinctrl-rzg2l 11030000.pinctrl: could not request pin 376 (P47_0) from group mux on device pinctrl-rzg2l renesas_sdhi_internal_dmac 11c10000.mmc: Error applying setting, reverse things back Fix this by constructing unique names from the node names of both the pin control configuration node and its child node, where appropriate. Based on the work done by Geert for the RZ/V2M pinctrl driver. Fixes: c4c4637eb57f ("pinctrl: renesas: Add RZ/G2L pin and gpio controller driver") Signed-off-by: Biju Das Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20230704111858.215278-1-biju.das.jz@bp.renesas.com Signed-off-by: Geert Uytterhoeven --- drivers/pinctrl/renesas/pinctrl-rzg2l.c | 28 ++++++++++++++++++++-------- 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/renesas/pinctrl-rzg2l.c b/drivers/pinctrl/renesas/pinctrl-rzg2l.c index 9511d920565e..b53d26167da5 100644 --- a/drivers/pinctrl/renesas/pinctrl-rzg2l.c +++ b/drivers/pinctrl/renesas/pinctrl-rzg2l.c @@ -249,6 +249,7 @@ static int rzg2l_map_add_config(struct pinctrl_map *map, static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct device_node *np, + struct device_node *parent, struct pinctrl_map **map, unsigned int *num_maps, unsigned int *index) @@ -266,6 +267,7 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev, struct property *prop; int ret, gsel, fsel; const char **pin_fn; + const char *name; const char *pin; pinmux = of_find_property(np, "pinmux", NULL); @@ -349,8 +351,19 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev, psel_val[i] = MUX_FUNC(value); } + if (parent) { + name = devm_kasprintf(pctrl->dev, GFP_KERNEL, "%pOFn.%pOFn", + parent, np); + if (!name) { + ret = -ENOMEM; + goto done; + } + } else { + name = np->name; + } + /* Register a single pin group listing all the pins we read from DT */ - gsel = pinctrl_generic_add_group(pctldev, np->name, pins, num_pinmux, NULL); + gsel = pinctrl_generic_add_group(pctldev, name, pins, num_pinmux, NULL); if (gsel < 0) { ret = gsel; goto done; @@ -360,17 +373,16 @@ static int rzg2l_dt_subnode_to_map(struct pinctrl_dev *pctldev, * Register a single group function where the 'data' is an array PSEL * register values read from DT. */ - pin_fn[0] = np->name; - fsel = pinmux_generic_add_function(pctldev, np->name, pin_fn, 1, - psel_val); + pin_fn[0] = name; + fsel = pinmux_generic_add_function(pctldev, name, pin_fn, 1, psel_val); if (fsel < 0) { ret = fsel; goto remove_group; } maps[idx].type = PIN_MAP_TYPE_MUX_GROUP; - maps[idx].data.mux.group = np->name; - maps[idx].data.mux.function = np->name; + maps[idx].data.mux.group = name; + maps[idx].data.mux.function = name; idx++; dev_dbg(pctrl->dev, "Parsed %pOF with %d pins\n", np, num_pinmux); @@ -417,7 +429,7 @@ static int rzg2l_dt_node_to_map(struct pinctrl_dev *pctldev, index = 0; for_each_child_of_node(np, child) { - ret = rzg2l_dt_subnode_to_map(pctldev, child, map, + ret = rzg2l_dt_subnode_to_map(pctldev, child, np, map, num_maps, &index); if (ret < 0) { of_node_put(child); @@ -426,7 +438,7 @@ static int rzg2l_dt_node_to_map(struct pinctrl_dev *pctldev, } if (*num_maps == 0) { - ret = rzg2l_dt_subnode_to_map(pctldev, np, map, + ret = rzg2l_dt_subnode_to_map(pctldev, np, NULL, map, num_maps, &index); if (ret < 0) goto done; -- cgit v1.2.3 From c09168c9392ac9250d87d71fc5ca3156f7456ea4 Mon Sep 17 00:00:00 2001 From: Ryan Wanner Date: Fri, 30 Jun 2023 09:17:00 -0700 Subject: MAINTAINERS: Add myself as a maintainer for Microchip SPI Tudor is not with Microchip anymore. I have worked lately with Microchip SPI drivers replacing Tudor with myself as this maintainer. Signed-off-by: Ryan Wanner Acked-by: Nicolas Ferre Acked-by: Tudor Ambarus Link: https://lore.kernel.org/r/20230630161700.448747-1-Ryan.Wanner@microchip.com Signed-off-by: Mark Brown --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index bc201627c2e0..5afc1b61daa4 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -13861,7 +13861,7 @@ T: git https://git.kernel.org/pub/scm/linux/kernel/git/conor/linux.git/ F: drivers/soc/microchip/ MICROCHIP SPI DRIVER -M: Tudor Ambarus +M: Ryan Wanner S: Supported F: drivers/spi/spi-atmel.* -- cgit v1.2.3 From 5158814cbb37bbb38344b3ecddc24ba2ed0365f2 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Thu, 29 Jun 2023 09:14:52 +0200 Subject: spi: bcm63xx: fix max prepend length The command word is defined as following: /* Command */ #define SPI_CMD_COMMAND_SHIFT 0 #define SPI_CMD_DEVICE_ID_SHIFT 4 #define SPI_CMD_PREPEND_BYTE_CNT_SHIFT 8 #define SPI_CMD_ONE_BYTE_SHIFT 11 #define SPI_CMD_ONE_WIRE_SHIFT 12 If the prepend byte count field starts at bit 8, and the next defined bit is SPI_CMD_ONE_BYTE at bit 11, it can be at most 3 bits wide, and thus the max value is 7, not 15. Fixes: b17de076062a ("spi/bcm63xx: work around inability to keep CS up") Signed-off-by: Jonas Gorski Link: https://lore.kernel.org/r/20230629071453.62024-1-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- drivers/spi/spi-bcm63xx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-bcm63xx.c b/drivers/spi/spi-bcm63xx.c index 9aecb77c3d89..07b5b71b2352 100644 --- a/drivers/spi/spi-bcm63xx.c +++ b/drivers/spi/spi-bcm63xx.c @@ -126,7 +126,7 @@ enum bcm63xx_regs_spi { SPI_MSG_DATA_SIZE, }; -#define BCM63XX_SPI_MAX_PREPEND 15 +#define BCM63XX_SPI_MAX_PREPEND 7 #define BCM63XX_SPI_MAX_CS 8 #define BCM63XX_SPI_BUS_NUM 0 -- cgit v1.2.3 From a2848d08742c8e8494675892c02c0d22acbe3cf8 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 7 Jul 2023 11:25:00 +0200 Subject: drm/ttm: never consider pinned BOs for eviction&swap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a small window where we have already incremented the pin count but not yet moved the bo from the lru to the pinned list. Signed-off-by: Christian König Reported-by: Pelloux-Prayer, Pierre-Eric Tested-by: Pelloux-Prayer, Pierre-Eric Acked-by: Alex Deucher Cc: stable@vger.kernel.org Link: https://patchwork.freedesktop.org/patch/msgid/20230707120826.3701-1-christian.koenig@amd.com --- drivers/gpu/drm/ttm/ttm_bo.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 1a1cfd675cc4..7139a522b2f3 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -517,6 +517,12 @@ static bool ttm_bo_evict_swapout_allowable(struct ttm_buffer_object *bo, { bool ret = false; + if (bo->pin_count) { + *locked = false; + *busy = false; + return false; + } + if (bo->base.resv == ctx->resv) { dma_resv_assert_held(bo->base.resv); if (ctx->allow_res_evict) -- cgit v1.2.3 From df9d70c18616760c6504b97fec66b6379c172dbb Mon Sep 17 00:00:00 2001 From: Bharath SM Date: Fri, 7 Jul 2023 15:29:01 +0000 Subject: cifs: if deferred close is disabled then close files immediately If defer close timeout value is set to 0, then there is no need to include files in the deferred close list and utilize the delayed worker for closing. Instead, we can close them immediately. Signed-off-by: Bharath SM Reviewed-by: Shyam Prasad N Cc: stable@vger.kernel.org Signed-off-by: Steve French --- fs/smb/client/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 879bc8e6555c..fc5acc95cd13 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -1080,8 +1080,8 @@ int cifs_close(struct inode *inode, struct file *file) cfile = file->private_data; file->private_data = NULL; dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); - if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && - cinode->lease_granted && + if ((cifs_sb->ctx->closetimeo && cinode->oplock == CIFS_CACHE_RHW_FLG) + && cinode->lease_granted && !test_bit(CIFS_INO_CLOSE_ON_LOCK, &cinode->flags) && dclose) { if (test_and_clear_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) { -- cgit v1.2.3 From e5bb0988a5b622f58cc53dbdc044562229284d23 Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Tue, 4 Jul 2023 09:36:56 +0200 Subject: nvme: add BOGUS_NID quirk for Samsung SM953 Add the quirk as SM953 is reporting bogus namespace ID. Link: https://bugzilla.kernel.org/show_bug.cgi?id=217593 Reported-by: Clemens Springsguth Tested-by: Clemens Springsguth Signed-off-by: Pankaj Raghav Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 72725729cb6c..8754b4a5c684 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3396,6 +3396,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x144d, 0xa809), /* Samsung MZALQ256HBJD 256G */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, + { PCI_DEVICE(0x144d, 0xa802), /* Samsung SM953 */ + .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1cc4, 0x6303), /* UMIS RPJTJ512MGE1QDY 512G */ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, }, { PCI_DEVICE(0x1cc4, 0x6302), /* UMIS RPJTJ256MGE1QDY 256G */ -- cgit v1.2.3 From 9bcf156f5897e91e21be54960b46774f0682afad Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 7 Jul 2023 09:32:22 +0900 Subject: nvmet: use PAGE_SECTORS_SHIFT Replace occurences of the pattern "PAGE_SHIFT - 9" in the passthru and loop targets with PAGE_SECTORS_SHIFT. Signed-off-by: Damien Le Moal Reviewed-by: Chaitanya Kulkarni Reviewed-by: Sagi Grimberg Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/target/loop.c | 2 +- drivers/nvme/target/passthru.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index f2d24b2d992f..48d5df054cd0 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -373,7 +373,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl) goto out_cleanup_tagset; ctrl->ctrl.max_hw_sectors = - (NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9); + (NVME_LOOP_MAX_SEGMENTS - 1) << PAGE_SECTORS_SHIFT; nvme_unquiesce_admin_queue(&ctrl->ctrl); diff --git a/drivers/nvme/target/passthru.c b/drivers/nvme/target/passthru.c index 71a9c1cc57f5..9fe07d7efa96 100644 --- a/drivers/nvme/target/passthru.c +++ b/drivers/nvme/target/passthru.c @@ -102,14 +102,14 @@ static u16 nvmet_passthru_override_id_ctrl(struct nvmet_req *req) * which depends on the host's memory fragementation. To solve this, * ensure mdts is limited to the pages equal to the number of segments. */ - max_hw_sectors = min_not_zero(pctrl->max_segments << (PAGE_SHIFT - 9), + max_hw_sectors = min_not_zero(pctrl->max_segments << PAGE_SECTORS_SHIFT, pctrl->max_hw_sectors); /* * nvmet_passthru_map_sg is limitted to using a single bio so limit * the mdts based on BIO_MAX_VECS as well */ - max_hw_sectors = min_not_zero(BIO_MAX_VECS << (PAGE_SHIFT - 9), + max_hw_sectors = min_not_zero(BIO_MAX_VECS << PAGE_SECTORS_SHIFT, max_hw_sectors); page_shift = NVME_CAP_MPSMIN(ctrl->cap) + 12; -- cgit v1.2.3 From b938e6603660652dc3db66d3c915fbfed3bce21d Mon Sep 17 00:00:00 2001 From: Ankit Kumar Date: Fri, 23 Jun 2023 18:08:05 +0530 Subject: nvme: fix the NVME_ID_NS_NVM_STS_MASK definition As per NVMe command set specification 1.0c Storage tag size is 7 bits. Fixes: 4020aad85c67 ("nvme: add support for enhanced metadata") Signed-off-by: Ankit Kumar Reviewed-by: Kanchan Joshi Signed-off-by: Keith Busch --- include/linux/nvme.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 182b6d614eb1..26dd3f859d9d 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -473,7 +473,7 @@ struct nvme_id_ns_nvm { }; enum { - NVME_ID_NS_NVM_STS_MASK = 0x3f, + NVME_ID_NS_NVM_STS_MASK = 0x7f, NVME_ID_NS_NVM_GUARD_SHIFT = 7, NVME_ID_NS_NVM_GUARD_MASK = 0x3, }; -- cgit v1.2.3 From dc8cbb65dc17b0daebca84375d35ce54ff730762 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 10 Jul 2023 10:23:05 -0600 Subject: block: remove dead struc request->completion_data field It's no longer used. While in there, also update the comment as to why it can coexist with the rb_node. Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 2b7fb8e87793..b96e00499f9e 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -158,13 +158,13 @@ struct request { /* * The rb_node is only used inside the io scheduler, requests - * are pruned when moved to the dispatch queue. So let the - * completion_data share space with the rb_node. + * are pruned when moved to the dispatch queue. special_vec must + * only be used if RQF_SPECIAL_PAYLOAD is set, and those cannot be + * insert into an IO scheduler. */ union { struct rb_node rb_node; /* sort/lookup */ struct bio_vec special_vec; - void *completion_data; }; /* -- cgit v1.2.3 From c44e783e0b4539cb8296da9229202a9f9e6a3c8d Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 3 Jul 2023 09:57:25 -0700 Subject: xtensa: ISS: add comment about etherdev freeing iss_net_configure explicitly frees etherdev in all error return paths except one where register_netdevice fails. In that remaining error return path the etherdev is freed by the iss_net_pdev_release callback triggered by the platform_device_unregister call. Add a comment stating that. Signed-off-by: Max Filippov --- arch/xtensa/platforms/iss/network.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 9ac46ab3a296..7b97e6ab85a4 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -540,6 +540,7 @@ static void iss_net_configure(int index, char *init) rtnl_unlock(); pr_err("%s: error registering net device!\n", dev->name); platform_device_unregister(&lp->pdev); + /* dev is freed by the iss_net_pdev_release callback */ return; } rtnl_unlock(); -- cgit v1.2.3 From bc8d5916541fa19ca5bc598eb51a5f78eb891a36 Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 3 Jul 2023 11:01:42 -0700 Subject: xtensa: ISS: fix call to split_if_spec split_if_spec expects a NULL-pointer as an end marker for the argument list, but tuntap_probe never supplied that terminating NULL. As a result incorrectly formatted interface specification string may cause a crash because of the random memory access. Fix that by adding NULL terminator to the split_if_spec argument list. Cc: stable@vger.kernel.org Fixes: 7282bee78798 ("[PATCH] xtensa: Architecture support for Tensilica Xtensa Part 8") Signed-off-by: Max Filippov --- arch/xtensa/platforms/iss/network.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/xtensa/platforms/iss/network.c b/arch/xtensa/platforms/iss/network.c index 7b97e6ab85a4..85c82cd42188 100644 --- a/arch/xtensa/platforms/iss/network.c +++ b/arch/xtensa/platforms/iss/network.c @@ -237,7 +237,7 @@ static int tuntap_probe(struct iss_net_private *lp, int index, char *init) init += sizeof(TRANSPORT_TUNTAP_NAME) - 1; if (*init == ',') { - rem = split_if_spec(init + 1, &mac_str, &dev_name); + rem = split_if_spec(init + 1, &mac_str, &dev_name, NULL); if (rem != NULL) { pr_err("%s: extra garbage on specification : '%s'\n", dev->name, rem); -- cgit v1.2.3 From a160e9414d8a1747225206558b24d7df513b3c8d Mon Sep 17 00:00:00 2001 From: Max Filippov Date: Mon, 10 Jul 2023 15:13:27 -0700 Subject: xtensa: fix unaligned and load/store configuration interaction Unaligned exception handler is needed in configurations with hardware support for unaligned access when the load/store exception handler is enabled because such configurations would still raise an exception on unaligned access through the instruction bus. Fixes: f29cf77609cc ("xtensa: add load/store exception handler") Signed-off-by: Max Filippov --- arch/xtensa/kernel/align.S | 34 ++++++++++++++-------------------- arch/xtensa/kernel/traps.c | 3 ++- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/arch/xtensa/kernel/align.S b/arch/xtensa/kernel/align.S index 20d6b4961001..ee97edce2300 100644 --- a/arch/xtensa/kernel/align.S +++ b/arch/xtensa/kernel/align.S @@ -1,7 +1,7 @@ /* * arch/xtensa/kernel/align.S * - * Handle unalignment exceptions in kernel space. + * Handle unalignment and load/store exceptions. * * This file is subject to the terms and conditions of the GNU General * Public License. See the file "COPYING" in the main directory of @@ -26,20 +26,18 @@ #define LOAD_EXCEPTION_HANDLER #endif -#if XCHAL_UNALIGNED_STORE_EXCEPTION || defined LOAD_EXCEPTION_HANDLER +#if XCHAL_UNALIGNED_STORE_EXCEPTION || defined CONFIG_XTENSA_LOAD_STORE +#define STORE_EXCEPTION_HANDLER +#endif + +#if defined LOAD_EXCEPTION_HANDLER || defined STORE_EXCEPTION_HANDLER #define ANY_EXCEPTION_HANDLER #endif -#if XCHAL_HAVE_WINDOWED +#if XCHAL_HAVE_WINDOWED && defined CONFIG_MMU #define UNALIGNED_USER_EXCEPTION #endif -/* First-level exception handler for unaligned exceptions. - * - * Note: This handler works only for kernel exceptions. Unaligned user - * access should get a seg fault. - */ - /* Big and little endian 16-bit values are located in * different halves of a register. HWORD_START helps to * abstract the notion of extracting a 16-bit value from a @@ -228,8 +226,6 @@ ENDPROC(fast_load_store) #ifdef ANY_EXCEPTION_HANDLER ENTRY(fast_unaligned) -#if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION - call0 .Lsave_and_load_instruction /* Analyze the instruction (load or store?). */ @@ -244,8 +240,7 @@ ENTRY(fast_unaligned) /* 'store indicator bit' not set, jump */ _bbci.l a4, OP1_SI_BIT + INSN_OP1, .Lload -#endif -#if XCHAL_UNALIGNED_STORE_EXCEPTION +#ifdef STORE_EXCEPTION_HANDLER /* Store: Jump to table entry to get the value in the source register.*/ @@ -254,7 +249,7 @@ ENTRY(fast_unaligned) addx8 a5, a6, a5 jx a5 # jump into table #endif -#if XCHAL_UNALIGNED_LOAD_EXCEPTION +#ifdef LOAD_EXCEPTION_HANDLER /* Load: Load memory address. */ @@ -328,7 +323,7 @@ ENTRY(fast_unaligned) mov a14, a3 ; _j .Lexit; .align 8 mov a15, a3 ; _j .Lexit; .align 8 #endif -#if XCHAL_UNALIGNED_STORE_EXCEPTION +#ifdef STORE_EXCEPTION_HANDLER .Lstore_table: l32i a3, a2, PT_AREG0; _j .Lstore_w; .align 8 mov a3, a1; _j .Lstore_w; .align 8 # fishy?? @@ -348,7 +343,6 @@ ENTRY(fast_unaligned) mov a3, a15 ; _j .Lstore_w; .align 8 #endif -#ifdef ANY_EXCEPTION_HANDLER /* We cannot handle this exception. */ .extern _kernel_exception @@ -377,8 +371,8 @@ ENTRY(fast_unaligned) 2: movi a0, _user_exception jx a0 -#endif -#if XCHAL_UNALIGNED_STORE_EXCEPTION + +#ifdef STORE_EXCEPTION_HANDLER # a7: instruction pointer, a4: instruction, a3: value .Lstore_w: @@ -444,7 +438,7 @@ ENTRY(fast_unaligned) s32i a6, a4, 4 #endif #endif -#ifdef ANY_EXCEPTION_HANDLER + .Lexit: #if XCHAL_HAVE_LOOPS rsr a4, lend # check if we reached LEND @@ -539,7 +533,7 @@ ENTRY(fast_unaligned) __src_b a4, a4, a5 # a4 has the instruction ret -#endif + ENDPROC(fast_unaligned) ENTRY(fast_unaligned_fixup) diff --git a/arch/xtensa/kernel/traps.c b/arch/xtensa/kernel/traps.c index 17eb180eff7c..427c125a137a 100644 --- a/arch/xtensa/kernel/traps.c +++ b/arch/xtensa/kernel/traps.c @@ -102,7 +102,8 @@ static dispatch_init_table_t __initdata dispatch_init_table[] = { #endif { EXCCAUSE_INTEGER_DIVIDE_BY_ZERO, 0, do_div0 }, /* EXCCAUSE_PRIVILEGED unhandled */ -#if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || XCHAL_UNALIGNED_STORE_EXCEPTION || \ + IS_ENABLED(CONFIG_XTENSA_LOAD_STORE) #ifdef CONFIG_XTENSA_UNALIGNED_USER { EXCCAUSE_UNALIGNED, USER, fast_unaligned }, #endif -- cgit v1.2.3 From 535d0ae39185a266536a1e97ff9a8956d7fbb9df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 11 Jul 2023 10:10:40 +0200 Subject: x86/cfi: Only define poison_cfi() if CONFIG_X86_KERNEL_IBT=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit poison_cfi() was introduced in: 9831c6253ace ("x86/cfi: Extend ENDBR sealing to kCFI") ... but it's only ever used under CONFIG_X86_KERNEL_IBT=y, and if that option is disabled, we get: arch/x86/kernel/alternative.c:1243:13: error: ‘poison_cfi’ defined but not used [-Werror=unused-function] Guard the definition with CONFIG_X86_KERNEL_IBT. Cc: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Sami Tolvanen Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/alternative.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d77aaabf9fe8..2dcf3a06af09 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1240,7 +1240,9 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, { } +#ifdef CONFIG_X86_KERNEL_IBT static void poison_cfi(void *addr) { } +#endif #endif -- cgit v1.2.3 From 5c413188c68da0e4bffc93de1c80257e20741e69 Mon Sep 17 00:00:00 2001 From: Stanislav Lisovskiy Date: Wed, 28 Jun 2023 17:10:17 +0300 Subject: drm/i915: Don't preserve dpll_hw_state for slave crtc in Bigjoiner MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If we are using Bigjoiner dpll_hw_state is supposed to be exactly same as for master crtc, so no need to save it's state for slave crtc. Signed-off-by: Stanislav Lisovskiy Fixes: 0ff0e219d9b8 ("drm/i915: Compute clocks earlier") Reviewed-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20230628141017.18937-1-stanislav.lisovskiy@intel.com (cherry picked from commit cbaf758809952c95ec00e796695049babb08bb60) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/display/intel_display.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/gpu/drm/i915/display/intel_display.c b/drivers/gpu/drm/i915/display/intel_display.c index d8533603ad05..16603d591f56 100644 --- a/drivers/gpu/drm/i915/display/intel_display.c +++ b/drivers/gpu/drm/i915/display/intel_display.c @@ -4564,7 +4564,6 @@ copy_bigjoiner_crtc_state_modeset(struct intel_atomic_state *state, saved_state->uapi = slave_crtc_state->uapi; saved_state->scaler_state = slave_crtc_state->scaler_state; saved_state->shared_dpll = slave_crtc_state->shared_dpll; - saved_state->dpll_hw_state = slave_crtc_state->dpll_hw_state; saved_state->crc_enabled = slave_crtc_state->crc_enabled; intel_crtc_free_hw_state(slave_crtc_state); -- cgit v1.2.3 From dde4c3d477d834212947f38519407df404acde4a Mon Sep 17 00:00:00 2001 From: Umesh Nerlige Ramappa Date: Fri, 16 Jun 2023 10:34:02 -0700 Subject: drm/i915/perf: Consider OA buffer boundary when zeroing out reports For reports that are not powers of 2, reports at the end of the OA buffer may get split across the buffer boundary. When zeroing out such reports, take the split into consideration. v2: Use OA_BUFFER_SIZE (Ashutosh) Fixes: 09a36015d9a0 ("drm/i915/perf: Clear out entire reports after reading if not power of 2 size") Signed-off-by: Umesh Nerlige Ramappa Reviewed-by: Ashutosh Dixit Signed-off-by: Ashutosh Dixit Link: https://patchwork.freedesktop.org/patch/msgid/20230616173402.699776-1-umesh.nerlige.ramappa@intel.com (cherry picked from commit 40b1588a750240cbe8a83117aa785d778749a77c) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/i915_perf.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 0a111b281578..7413c11fb562 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -868,8 +868,17 @@ static int gen8_append_oa_reports(struct i915_perf_stream *stream, oa_report_id_clear(stream, report32); oa_timestamp_clear(stream, report32); } else { + u8 *oa_buf_end = stream->oa_buffer.vaddr + + OA_BUFFER_SIZE; + u32 part = oa_buf_end - (u8 *)report32; + /* Zero out the entire report */ - memset(report32, 0, report_size); + if (report_size <= part) { + memset(report32, 0, report_size); + } else { + memset(report32, 0, part); + memset(oa_buf_base, 0, report_size - part); + } } } -- cgit v1.2.3 From 6bf0961a008ac74b085f1690fba8520ac3b253ee Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 7 Jul 2023 13:46:44 +0100 Subject: drm/i915: Remove dead code from gen8_pte_encode Commit 9275277d5324 ("drm/i915: use pat_index instead of cache_level") added a dedicated gen12_pte_encode but forgot to remove the Gen12 specific bit from gen8_pte_encode. Signed-off-by: Tvrtko Ursulin Fixes: 9275277d5324 ("drm/i915: use pat_index instead of cache_level") Cc: Fei Yang Cc: Andi Shyti Cc: Matt Roper Reviewed-by: Rodrigo Vivi Reviewed-by: Fei Yang Link: https://patchwork.freedesktop.org/patch/msgid/20230707124644.3965281-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit 08509377dd82ead98429785509f6b52a4b5f09f5) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/gen8_ppgtt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c index f948d33e5ec5..c8568e5d1147 100644 --- a/drivers/gpu/drm/i915/gt/gen8_ppgtt.c +++ b/drivers/gpu/drm/i915/gt/gen8_ppgtt.c @@ -37,9 +37,6 @@ static u64 gen8_pte_encode(dma_addr_t addr, if (unlikely(flags & PTE_READ_ONLY)) pte &= ~GEN8_PAGE_RW; - if (flags & PTE_LM) - pte |= GEN12_PPGTT_PTE_LM; - /* * For pre-gen12 platforms pat_index is the same as enum * i915_cache_level, so the switch-case here is still valid. -- cgit v1.2.3 From 113899c2669dff148b2a5bea4780123811aecc13 Mon Sep 17 00:00:00 2001 From: Tvrtko Ursulin Date: Fri, 7 Jul 2023 13:55:03 +0100 Subject: drm/i915: Fix one wrong caching mode enum usage Commit a4d86249c773 ("drm/i915/gt: Provide a utility to create a scratch buffer") mistakenly passed in uapi I915_CACHING_CACHED as argument to i915_gem_object_set_cache_coherency(), which actually takes internal enum i915_cache_level. No functional issue since the value matches I915_CACHE_LLC (1 == 1), which is the intended caching mode, but lets clean it up nevertheless. Signed-off-by: Tvrtko Ursulin Fixes: a4d86249c773 ("drm/i915/gt: Provide a utility to create a scratch buffer") Cc: Daniele Ceraolo Spurio Reviewed-by: Tejas Upadhyay Link: https://patchwork.freedesktop.org/patch/msgid/20230707125503.3965817-1-tvrtko.ursulin@linux.intel.com (cherry picked from commit 49c60b2f0867ac36fd54d513882a48431aeccae7) Signed-off-by: Tvrtko Ursulin --- drivers/gpu/drm/i915/gt/intel_gtt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gtt.c b/drivers/gpu/drm/i915/gt/intel_gtt.c index 2f6a9be0ffe6..731d9f2bbc56 100644 --- a/drivers/gpu/drm/i915/gt/intel_gtt.c +++ b/drivers/gpu/drm/i915/gt/intel_gtt.c @@ -670,7 +670,7 @@ __vm_create_scratch_for_read(struct i915_address_space *vm, unsigned long size) if (IS_ERR(obj)) return ERR_CAST(obj); - i915_gem_object_set_cache_coherency(obj, I915_CACHING_CACHED); + i915_gem_object_set_cache_coherency(obj, I915_CACHE_LLC); vma = i915_vma_instance(obj, vm, NULL); if (IS_ERR(vma)) { -- cgit v1.2.3 From 27655b9bb9f0d9c32b8de8bec649b676898c52d5 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Mon, 10 Jul 2023 11:10:17 +0200 Subject: drm/client: Send hotplug event after registering a client MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Generate a hotplug event after registering a client to allow the client to configure its display. Remove the hotplug calls from the existing clients for fbdev emulation. This change fixes a concurrency bug between registering a client and receiving events from the DRM core. The bug is present in the fbdev emulation of all drivers. The fbdev emulation currently generates a hotplug event before registering the client to the device. For each new output, the DRM core sends an additional hotplug event to each registered client. If the DRM core detects first output between sending the artificial hotplug and registering the device, the output's hotplug event gets lost. If this is the first output, the fbdev console display remains dark. This has been observed with amdgpu and fbdev-generic. Fix this by adding hotplug generation directly to the client's register helper drm_client_register(). Registering the client and receiving events are serialized by struct drm_device.clientlist_mutex. So an output is either configured by the initial hotplug event, or the client has already been registered. The bug was originally added in commit 6e3f17ee73f7 ("drm/fb-helper: generic: Call drm_client_add() after setup is done"), in which adding a client and receiving a hotplug event switched order. It was hidden, as most hardware and drivers have at least on static output configured. Other drivers didn't use the internal DRM client or still had struct drm_mode_config_funcs.output_poll_changed set. That callback handled hotplug events as well. After not setting the callback in amdgpu in commit 0e3172bac3f4 ("drm/amdgpu: Don't set struct drm_driver.output_poll_changed"), amdgpu did not show a framebuffer console if output events got lost. The bug got copy-pasted from fbdev-generic into the other fbdev emulation. Reported-by: Moritz Duge Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2649 Fixes: 6e3f17ee73f7 ("drm/fb-helper: generic: Call drm_client_add() after setup is done") Fixes: 8ab59da26bc0 ("drm/fb-helper: Move generic fbdev emulation into separate source file") Fixes: b79fe9abd58b ("drm/fbdev-dma: Implement fbdev emulation for GEM DMA helpers") Fixes: 63c381552f69 ("drm/armada: Implement fbdev emulation as in-kernel client") Fixes: 49953b70e7d3 ("drm/exynos: Implement fbdev emulation as in-kernel client") Fixes: 8f1aaccb04b7 ("drm/gma500: Implement client-based fbdev emulation") Fixes: 940b869c2f2f ("drm/msm: Implement fbdev emulation as in-kernel client") Fixes: 9e69bcd88e45 ("drm/omapdrm: Implement fbdev emulation as in-kernel client") Fixes: e317a69fe891 ("drm/radeon: Implement client-based fbdev emulation") Fixes: 71ec16f45ef8 ("drm/tegra: Implement fbdev emulation as in-kernel client") Fixes: 0e3172bac3f4 ("drm/amdgpu: Don't set struct drm_driver.output_poll_changed") Signed-off-by: Thomas Zimmermann Tested-by: Moritz Duge Tested-by: Torsten Krah Tested-by: Paul Schyska Cc: Daniel Vetter Cc: David Airlie Cc: Noralf Trønnes Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Javier Martinez Canillas Cc: Russell King Cc: Inki Dae Cc: Seung-Woo Kim Cc: Kyungmin Park Cc: Krzysztof Kozlowski Cc: Patrik Jakobsson Cc: Rob Clark Cc: Abhinav Kumar Cc: Dmitry Baryshkov Cc: Tomi Valkeinen Cc: Alex Deucher Cc: "Christian König" Cc: "Pan, Xinhui" Cc: Thierry Reding Cc: Mikko Perttunen Cc: dri-devel@lists.freedesktop.org Cc: linux-kernel@vger.kernel.org Cc: linux-arm-kernel@lists.infradead.org Cc: linux-samsung-soc@vger.kernel.org Cc: linux-arm-msm@vger.kernel.org Cc: freedreno@lists.freedesktop.org Cc: amd-gfx@lists.freedesktop.org Cc: linux-tegra@vger.kernel.org Cc: dri-devel@lists.freedesktop.org Cc: # v5.2+ Reviewed-by: Javier Martinez Canillas Reviewed-by: Dmitry Baryshkov # msm Link: https://patchwork.freedesktop.org/patch/msgid/20230710091029.27503-1-tzimmermann@suse.de --- drivers/gpu/drm/armada/armada_fbdev.c | 4 ---- drivers/gpu/drm/drm_client.c | 21 +++++++++++++++++++++ drivers/gpu/drm/drm_fbdev_dma.c | 4 ---- drivers/gpu/drm/drm_fbdev_generic.c | 4 ---- drivers/gpu/drm/exynos/exynos_drm_fbdev.c | 4 ---- drivers/gpu/drm/gma500/fbdev.c | 4 ---- drivers/gpu/drm/msm/msm_fbdev.c | 4 ---- drivers/gpu/drm/omapdrm/omap_fbdev.c | 4 ---- drivers/gpu/drm/radeon/radeon_fbdev.c | 4 ---- drivers/gpu/drm/tegra/fbdev.c | 4 ---- 10 files changed, 21 insertions(+), 36 deletions(-) diff --git a/drivers/gpu/drm/armada/armada_fbdev.c b/drivers/gpu/drm/armada/armada_fbdev.c index 3943e89cc06c..e40a95e51785 100644 --- a/drivers/gpu/drm/armada/armada_fbdev.c +++ b/drivers/gpu/drm/armada/armada_fbdev.c @@ -209,10 +209,6 @@ void armada_fbdev_setup(struct drm_device *dev) goto err_drm_client_init; } - ret = armada_fbdev_client_hotplug(&fbh->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&fbh->client); return; diff --git a/drivers/gpu/drm/drm_client.c b/drivers/gpu/drm/drm_client.c index f6292ba0e6fc..037e36f2049c 100644 --- a/drivers/gpu/drm/drm_client.c +++ b/drivers/gpu/drm/drm_client.c @@ -122,13 +122,34 @@ EXPORT_SYMBOL(drm_client_init); * drm_client_register() it is no longer permissible to call drm_client_release() * directly (outside the unregister callback), instead cleanup will happen * automatically on driver unload. + * + * Registering a client generates a hotplug event that allows the client + * to set up its display from pre-existing outputs. The client must have + * initialized its state to able to handle the hotplug event successfully. */ void drm_client_register(struct drm_client_dev *client) { struct drm_device *dev = client->dev; + int ret; mutex_lock(&dev->clientlist_mutex); list_add(&client->list, &dev->clientlist); + + if (client->funcs && client->funcs->hotplug) { + /* + * Perform an initial hotplug event to pick up the + * display configuration for the client. This step + * has to be performed *after* registering the client + * in the list of clients, or a concurrent hotplug + * event might be lost; leaving the display off. + * + * Hold the clientlist_mutex as for a regular hotplug + * event. + */ + ret = client->funcs->hotplug(client); + if (ret) + drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); + } mutex_unlock(&dev->clientlist_mutex); } EXPORT_SYMBOL(drm_client_register); diff --git a/drivers/gpu/drm/drm_fbdev_dma.c b/drivers/gpu/drm/drm_fbdev_dma.c index 76aa52b38a13..f353daff65e1 100644 --- a/drivers/gpu/drm/drm_fbdev_dma.c +++ b/drivers/gpu/drm/drm_fbdev_dma.c @@ -252,10 +252,6 @@ void drm_fbdev_dma_setup(struct drm_device *dev, unsigned int preferred_bpp) goto err_drm_client_init; } - ret = drm_fbdev_dma_client_hotplug(&fb_helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&fb_helper->client); return; diff --git a/drivers/gpu/drm/drm_fbdev_generic.c b/drivers/gpu/drm/drm_fbdev_generic.c index 98ae703848a0..b9343fb6cf13 100644 --- a/drivers/gpu/drm/drm_fbdev_generic.c +++ b/drivers/gpu/drm/drm_fbdev_generic.c @@ -339,10 +339,6 @@ void drm_fbdev_generic_setup(struct drm_device *dev, unsigned int preferred_bpp) goto err_drm_client_init; } - ret = drm_fbdev_generic_client_hotplug(&fb_helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&fb_helper->client); return; diff --git a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c index fdf65587f1fe..226310c765d8 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fbdev.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fbdev.c @@ -215,10 +215,6 @@ void exynos_drm_fbdev_setup(struct drm_device *dev) if (ret) goto err_drm_client_init; - ret = exynos_drm_fbdev_client_hotplug(&fb_helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&fb_helper->client); return; diff --git a/drivers/gpu/drm/gma500/fbdev.c b/drivers/gpu/drm/gma500/fbdev.c index 955cbe9f05a7..054426549fc6 100644 --- a/drivers/gpu/drm/gma500/fbdev.c +++ b/drivers/gpu/drm/gma500/fbdev.c @@ -328,10 +328,6 @@ void psb_fbdev_setup(struct drm_psb_private *dev_priv) goto err_drm_fb_helper_unprepare; } - ret = psb_fbdev_client_hotplug(&fb_helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&fb_helper->client); return; diff --git a/drivers/gpu/drm/msm/msm_fbdev.c b/drivers/gpu/drm/msm/msm_fbdev.c index b933a85420f6..bf1e17dc4550 100644 --- a/drivers/gpu/drm/msm/msm_fbdev.c +++ b/drivers/gpu/drm/msm/msm_fbdev.c @@ -246,10 +246,6 @@ void msm_fbdev_setup(struct drm_device *dev) goto err_drm_fb_helper_unprepare; } - ret = msm_fbdev_client_hotplug(&helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&helper->client); return; diff --git a/drivers/gpu/drm/omapdrm/omap_fbdev.c b/drivers/gpu/drm/omapdrm/omap_fbdev.c index b7ccce0704a3..fe6639c1cdf3 100644 --- a/drivers/gpu/drm/omapdrm/omap_fbdev.c +++ b/drivers/gpu/drm/omapdrm/omap_fbdev.c @@ -318,10 +318,6 @@ void omap_fbdev_setup(struct drm_device *dev) INIT_WORK(&fbdev->work, pan_worker); - ret = omap_fbdev_client_hotplug(&helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&helper->client); return; diff --git a/drivers/gpu/drm/radeon/radeon_fbdev.c b/drivers/gpu/drm/radeon/radeon_fbdev.c index ab9c1abbac97..f941e2e7cae6 100644 --- a/drivers/gpu/drm/radeon/radeon_fbdev.c +++ b/drivers/gpu/drm/radeon/radeon_fbdev.c @@ -383,10 +383,6 @@ void radeon_fbdev_setup(struct radeon_device *rdev) goto err_drm_client_init; } - ret = radeon_fbdev_client_hotplug(&fb_helper->client); - if (ret) - drm_dbg_kms(rdev->ddev, "client hotplug ret=%d\n", ret); - drm_client_register(&fb_helper->client); return; diff --git a/drivers/gpu/drm/tegra/fbdev.c b/drivers/gpu/drm/tegra/fbdev.c index e74d9be981c7..d042234e1807 100644 --- a/drivers/gpu/drm/tegra/fbdev.c +++ b/drivers/gpu/drm/tegra/fbdev.c @@ -225,10 +225,6 @@ void tegra_fbdev_setup(struct drm_device *dev) if (ret) goto err_drm_client_init; - ret = tegra_fbdev_client_hotplug(&helper->client); - if (ret) - drm_dbg_kms(dev, "client hotplug ret=%d\n", ret); - drm_client_register(&helper->client); return; -- cgit v1.2.3 From 56cbeacf143530576905623ac72ae0964f3293a6 Mon Sep 17 00:00:00 2001 From: Georg Müller Date: Wed, 28 Jun 2023 10:45:50 +0200 Subject: perf probe: Add test for regression introduced by switch to die_get_decl_file() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds a test to validate that 'perf probe' works for binaries where DWARF info is split into multiple CUs Signed-off-by: Georg Müller Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: regressions@lists.linux.dev Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230628084551.1860532-5-georgmueller@gmx.net Signed-off-by: Arnaldo Carvalho de Melo --- .../tests/shell/test_uprobe_from_different_cu.sh | 77 ++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100755 tools/perf/tests/shell/test_uprobe_from_different_cu.sh diff --git a/tools/perf/tests/shell/test_uprobe_from_different_cu.sh b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh new file mode 100755 index 000000000000..00d2e0e2e0c2 --- /dev/null +++ b/tools/perf/tests/shell/test_uprobe_from_different_cu.sh @@ -0,0 +1,77 @@ +#!/bin/bash +# test perf probe of function from different CU +# SPDX-License-Identifier: GPL-2.0 + +set -e + +temp_dir=$(mktemp -d /tmp/perf-uprobe-different-cu-sh.XXXXXXXXXX) + +cleanup() +{ + trap - EXIT TERM INT + if [[ "${temp_dir}" =~ ^/tmp/perf-uprobe-different-cu-sh.*$ ]]; then + echo "--- Cleaning up ---" + perf probe -x ${temp_dir}/testfile -d foo + rm -f "${temp_dir}/"* + rmdir "${temp_dir}" + fi +} + +trap_cleanup() +{ + cleanup + exit 1 +} + +trap trap_cleanup EXIT TERM INT + +cat > ${temp_dir}/testfile-foo.h << EOF +struct t +{ + int *p; + int c; +}; + +extern int foo (int i, struct t *t); +EOF + +cat > ${temp_dir}/testfile-foo.c << EOF +#include "testfile-foo.h" + +int +foo (int i, struct t *t) +{ + int j, res = 0; + for (j = 0; j < i && j < t->c; j++) + res += t->p[j]; + + return res; +} +EOF + +cat > ${temp_dir}/testfile-main.c << EOF +#include "testfile-foo.h" + +static struct t g; + +int +main (int argc, char **argv) +{ + int i; + int j[argc]; + g.c = argc; + g.p = j; + for (i = 0; i < argc; i++) + j[i] = (int) argv[i][0]; + return foo (3, &g); +} +EOF + +gcc -g -Og -flto -c ${temp_dir}/testfile-foo.c -o ${temp_dir}/testfile-foo.o +gcc -g -Og -c ${temp_dir}/testfile-main.c -o ${temp_dir}/testfile-main.o +gcc -g -Og -o ${temp_dir}/testfile ${temp_dir}/testfile-foo.o ${temp_dir}/testfile-main.o + +perf probe -x ${temp_dir}/testfile --funcs foo +perf probe -x ${temp_dir}/testfile foo + +cleanup -- cgit v1.2.3 From c66e1c68c13b872505f25ab641c44b77313ee7fe Mon Sep 17 00:00:00 2001 From: Georg Müller Date: Wed, 28 Jun 2023 10:45:51 +0200 Subject: perf probe: Read DWARF files from the correct CU MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After switching from dwarf_decl_file() to die_get_decl_file(), it is not possible to add probes for certain functions: $ perf probe -x /usr/lib/systemd/systemd-logind match_unit_removed A function DIE doesn't have decl_line. Maybe broken DWARF? A function DIE doesn't have decl_line. Maybe broken DWARF? Probe point 'match_unit_removed' not found. Error: Failed to add events. The problem is that die_get_decl_file() uses the wrong CU to search for the file. elfutils commit e1db5cdc9f has some good explanation for this: dwarf_decl_file uses dwarf_attr_integrate to get the DW_AT_decl_file attribute. This means the attribute might come from a different DIE in a different CU. If so, we need to use the CU associated with the attribute, not the original DIE, to resolve the file name. This patch uses the same source of information as elfutils: use attribute DW_AT_decl_file and use this CU to search for the file. Fixes: dc9a5d2ccd5c823c ("perf probe: Fix to get declared file name from clang DWARF5") Signed-off-by: Georg Müller Acked-by: Masami Hiramatsu (Google) Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: regressions@lists.linux.dev Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20230628084551.1860532-6-georgmueller@gmx.net Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/dwarf-aux.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 45e018c0ebf5..2941d88f2199 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -478,8 +478,10 @@ static const char *die_get_file_name(Dwarf_Die *dw_die, int idx) { Dwarf_Die cu_die; Dwarf_Files *files; + Dwarf_Attribute attr_mem; - if (idx < 0 || !dwarf_diecu(dw_die, &cu_die, NULL, NULL) || + if (idx < 0 || !dwarf_attr_integrate(dw_die, DW_AT_decl_file, &attr_mem) || + !dwarf_cu_die(attr_mem.cu, &cu_die, NULL, NULL, NULL, NULL, NULL, NULL) || dwarf_getsrcfiles(&cu_die, &files, NULL) != 0) return NULL; -- cgit v1.2.3 From 142256d2f41af6f7a9dbbe7db49eecc70858b1f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 09:53:10 -0300 Subject: tools headers UAPI: Sync drm/i915_drm.h with the kernel sources 81b1b599dfd71c95 ("drm/i915: Allow user to set cache at BO creation") 98d2722a85c4ad5f ("drm/i915/huc: differentiate the 2 steps of the MTL HuC auth flow") bc4be0a38b63b6d4 ("drm/i915/pmu: Prepare for multi-tile non-engine counters") d1da138f245d4fb4 ("drm/i915/uapi/pxp: Add a GET_PARAM for PXP") That adds some ioctls but use the __I915_PMU_OTHER() macro, not supported yet in the tools/perf/trace/beauty/drm_ioctl.sh conversion script. This silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/drm/i915_drm.h include/uapi/drm/i915_drm.h Cc: Adrian Hunter Cc: Alan Previn Cc: Andi Shyti Cc: Daniele Ceraolo Spurio Cc: Fei Yang Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Radhakrishna Sripada Cc: Tvrtko Ursulin Cc: Umesh Nerlige Ramappa Link: https://lore.kernel.org/lkml/ZK1R%2FIyWcUKYQbQV@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/drm/i915_drm.h | 95 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 93 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/drm/i915_drm.h b/tools/include/uapi/drm/i915_drm.h index dba7c5a5b25e..7000e5910a1d 100644 --- a/tools/include/uapi/drm/i915_drm.h +++ b/tools/include/uapi/drm/i915_drm.h @@ -280,7 +280,16 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_ENGINE_SEMA(class, instance) \ __I915_PMU_ENGINE(class, instance, I915_SAMPLE_SEMA) -#define __I915_PMU_OTHER(x) (__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) +/* + * Top 4 bits of every non-engine counter are GT id. + */ +#define __I915_PMU_GT_SHIFT (60) + +#define ___I915_PMU_OTHER(gt, x) \ + (((__u64)__I915_PMU_ENGINE(0xff, 0xff, 0xf) + 1 + (x)) | \ + ((__u64)(gt) << __I915_PMU_GT_SHIFT)) + +#define __I915_PMU_OTHER(x) ___I915_PMU_OTHER(0, x) #define I915_PMU_ACTUAL_FREQUENCY __I915_PMU_OTHER(0) #define I915_PMU_REQUESTED_FREQUENCY __I915_PMU_OTHER(1) @@ -290,6 +299,12 @@ enum drm_i915_pmu_engine_sample { #define I915_PMU_LAST /* Deprecated - do not use */ I915_PMU_RC6_RESIDENCY +#define __I915_PMU_ACTUAL_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 0) +#define __I915_PMU_REQUESTED_FREQUENCY(gt) ___I915_PMU_OTHER(gt, 1) +#define __I915_PMU_INTERRUPTS(gt) ___I915_PMU_OTHER(gt, 2) +#define __I915_PMU_RC6_RESIDENCY(gt) ___I915_PMU_OTHER(gt, 3) +#define __I915_PMU_SOFTWARE_GT_AWAKE_TIME(gt) ___I915_PMU_OTHER(gt, 4) + /* Each region is a minimum of 16k, and there are at most 255 of them. */ #define I915_NR_TEX_REGIONS 255 /* table size 2k - maximum due to use @@ -659,7 +674,8 @@ typedef struct drm_i915_irq_wait { * If the IOCTL is successful, the returned parameter will be set to one of the * following values: * * 0 if HuC firmware load is not complete, - * * 1 if HuC firmware is authenticated and running. + * * 1 if HuC firmware is loaded and fully authenticated, + * * 2 if HuC firmware is loaded and authenticated for clear media only */ #define I915_PARAM_HUC_STATUS 42 @@ -771,6 +787,25 @@ typedef struct drm_i915_irq_wait { */ #define I915_PARAM_OA_TIMESTAMP_FREQUENCY 57 +/* + * Query the status of PXP support in i915. + * + * The query can fail in the following scenarios with the listed error codes: + * -ENODEV = PXP support is not available on the GPU device or in the + * kernel due to missing component drivers or kernel configs. + * + * If the IOCTL is successful, the returned parameter will be set to one of + * the following values: + * 1 = PXP feature is supported and is ready for use. + * 2 = PXP feature is supported but should be ready soon (pending + * initialization of non-i915 system dependencies). + * + * NOTE: When param is supported (positive return values), user space should + * still refer to the GEM PXP context-creation UAPI header specs to be + * aware of possible failure due to system state machine at the time. + */ +#define I915_PARAM_PXP_STATUS 58 + /* Must be kept compact -- no holes and well documented */ /** @@ -2096,6 +2131,21 @@ struct drm_i915_gem_context_param { * * -ENODEV: feature not available * -EPERM: trying to mark a recoverable or not bannable context as protected + * -ENXIO: A dependency such as a component driver or firmware is not yet + * loaded so user space may need to attempt again. Depending on the + * device, this error may be reported if protected context creation is + * attempted very early after kernel start because the internal timeout + * waiting for such dependencies is not guaranteed to be larger than + * required (numbers differ depending on system and kernel config): + * - ADL/RPL: dependencies may take up to 3 seconds from kernel start + * while context creation internal timeout is 250 milisecs + * - MTL: dependencies may take up to 8 seconds from kernel start + * while context creation internal timeout is 250 milisecs + * NOTE: such dependencies happen once, so a subsequent call to create a + * protected context after a prior successful call will not experience + * such timeouts and will not return -ENXIO (unless the driver is reloaded, + * or, depending on the device, resumes from a suspended state). + * -EIO: The firmware did not succeed in creating the protected context. */ #define I915_CONTEXT_PARAM_PROTECTED_CONTENT 0xd /* Must be kept compact -- no holes and well documented */ @@ -3630,9 +3680,13 @@ struct drm_i915_gem_create_ext { * * For I915_GEM_CREATE_EXT_PROTECTED_CONTENT usage see * struct drm_i915_gem_create_ext_protected_content. + * + * For I915_GEM_CREATE_EXT_SET_PAT usage see + * struct drm_i915_gem_create_ext_set_pat. */ #define I915_GEM_CREATE_EXT_MEMORY_REGIONS 0 #define I915_GEM_CREATE_EXT_PROTECTED_CONTENT 1 +#define I915_GEM_CREATE_EXT_SET_PAT 2 __u64 extensions; }; @@ -3747,6 +3801,43 @@ struct drm_i915_gem_create_ext_protected_content { __u32 flags; }; +/** + * struct drm_i915_gem_create_ext_set_pat - The + * I915_GEM_CREATE_EXT_SET_PAT extension. + * + * If this extension is provided, the specified caching policy (PAT index) is + * applied to the buffer object. + * + * Below is an example on how to create an object with specific caching policy: + * + * .. code-block:: C + * + * struct drm_i915_gem_create_ext_set_pat set_pat_ext = { + * .base = { .name = I915_GEM_CREATE_EXT_SET_PAT }, + * .pat_index = 0, + * }; + * struct drm_i915_gem_create_ext create_ext = { + * .size = PAGE_SIZE, + * .extensions = (uintptr_t)&set_pat_ext, + * }; + * + * int err = ioctl(fd, DRM_IOCTL_I915_GEM_CREATE_EXT, &create_ext); + * if (err) ... + */ +struct drm_i915_gem_create_ext_set_pat { + /** @base: Extension link. See struct i915_user_extension. */ + struct i915_user_extension base; + /** + * @pat_index: PAT index to be set + * PAT index is a bit field in Page Table Entry to control caching + * behaviors for GPU accesses. The definition of PAT index is + * platform dependent and can be found in hardware specifications, + */ + __u32 pat_index; + /** @rsvd: reserved for future use */ + __u32 rsvd; +}; + /* ID of the protected content session managed by i915 when PXP is active */ #define I915_PROTECTED_CONTENT_DEFAULT_SESSION 0xf -- cgit v1.2.3 From 9350a917913321c69edbfdfd204e0d2855cddd06 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 10:11:12 -0300 Subject: tools headers UAPI: Sync files changed by new cachestat syscall with the kernel sources To pick the changes in these csets: cf264e1329fb0307 ("cachestat: implement cachestat syscall") That add support for this new syscall in tools such as 'perf trace'. For instance, this is now possible: # perf trace -e cachestat ^C[root@five ~]# # perf trace -v -e cachestat Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 3163687 && common_pid != 3147) && (id == 451) mmap size 528384B ^C[root@five ~] # perf trace -v -e *stat* --max-events=10 Using CPUID AuthenticAMD-25-21-0 event qualifier tracepoint filter: (common_pid != 3163713 && common_pid != 3147) && (id == 4 || id == 5 || id == 6 || id == 136 || id == 137 || id == 138 || id == 262 || id == 332 || id == 451) mmap size 528384B 0.000 ( 0.009 ms): Cache2 I/O/4544 statfs(pathname: 0x45635288, buf: 0x7f8745725b60) = 0 0.012 ( 0.003 ms): Cache2 I/O/4544 newfstatat(dfd: CWD, filename: 0x45635288, statbuf: 0x7f874569d250) = 0 0.036 ( 0.002 ms): Cache2 I/O/4544 newfstatat(dfd: 138, filename: 0x541b7093, statbuf: 0x7f87457256f0, flag: 4096) = 0 0.372 ( 0.006 ms): Cache2 I/O/4544 statfs(pathname: 0x45635288, buf: 0x7f8745725b10) = 0 0.379 ( 0.003 ms): Cache2 I/O/4544 newfstatat(dfd: CWD, filename: 0x45635288, statbuf: 0x7f874569d250) = 0 0.390 ( 0.002 ms): Cache2 I/O/4544 newfstatat(dfd: 138, filename: 0x541b7093, statbuf: 0x7f87457256a0, flag: 4096) = 0 0.609 ( 0.005 ms): Cache2 I/O/4544 statfs(pathname: 0x45635288, buf: 0x7f8745725b60) = 0 0.615 ( 0.003 ms): Cache2 I/O/4544 newfstatat(dfd: CWD, filename: 0x45635288, statbuf: 0x7f874569d250) = 0 0.625 ( 0.002 ms): Cache2 I/O/4544 newfstatat(dfd: 138, filename: 0x541b7093, statbuf: 0x7f87457256f0, flag: 4096) = 0 0.826 ( 0.005 ms): Cache2 I/O/4544 statfs(pathname: 0x45635288, buf: 0x7f8745725b10) = 0 # That is the filter expression attached to the raw_syscalls:sys_{enter,exit} tracepoints. $ find tools/perf/arch/ -name "syscall*tbl" | xargs grep -w sys_cachestat tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl:451 n64 cachestat sys_cachestat tools/perf/arch/powerpc/entry/syscalls/syscall.tbl:451 common cachestat sys_cachestat tools/perf/arch/s390/entry/syscalls/syscall.tbl:451 common cachestat sys_cachestat sys_cachestat tools/perf/arch/x86/entry/syscalls/syscall_64.tbl:451 common cachestat sys_cachestat $ $ grep -w cachestat /tmp/build/perf-tools/arch/x86/include/generated/asm/syscalls_64.c [451] = "cachestat", $ This addresses these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/asm-generic/unistd.h include/uapi/asm-generic/unistd.h diff -u tools/include/uapi/linux/mman.h include/uapi/linux/mman.h diff -u tools/perf/arch/x86/entry/syscalls/syscall_64.tbl arch/x86/entry/syscalls/syscall_64.tbl diff -u tools/perf/arch/powerpc/entry/syscalls/syscall.tbl arch/powerpc/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/s390/entry/syscalls/syscall.tbl arch/s390/kernel/syscalls/syscall.tbl diff -u tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl arch/mips/kernel/syscalls/syscall_n64.tbl Cc: Adrian Hunter Cc: Andrew Morton Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Nhat Pham Link: https://lore.kernel.org/lkml/ZK1pVBJpbjujJNJW@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/asm-generic/unistd.h | 5 ++++- tools/include/uapi/linux/mman.h | 14 ++++++++++++++ tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl | 1 + tools/perf/arch/powerpc/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/s390/entry/syscalls/syscall.tbl | 1 + tools/perf/arch/x86/entry/syscalls/syscall_64.tbl | 1 + 6 files changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/asm-generic/unistd.h b/tools/include/uapi/asm-generic/unistd.h index dd7d8e10f16d..fd6c1cb585db 100644 --- a/tools/include/uapi/asm-generic/unistd.h +++ b/tools/include/uapi/asm-generic/unistd.h @@ -817,8 +817,11 @@ __SYSCALL(__NR_futex_waitv, sys_futex_waitv) #define __NR_set_mempolicy_home_node 450 __SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) +#define __NR_cachestat 451 +__SYSCALL(__NR_cachestat, sys_cachestat) + #undef __NR_syscalls -#define __NR_syscalls 451 +#define __NR_syscalls 452 /* * 32 bit systems traditionally used different diff --git a/tools/include/uapi/linux/mman.h b/tools/include/uapi/linux/mman.h index f55bc680b5b0..a246e11988d5 100644 --- a/tools/include/uapi/linux/mman.h +++ b/tools/include/uapi/linux/mman.h @@ -4,6 +4,7 @@ #include #include +#include #define MREMAP_MAYMOVE 1 #define MREMAP_FIXED 2 @@ -41,4 +42,17 @@ #define MAP_HUGE_2GB HUGETLB_FLAG_ENCODE_2GB #define MAP_HUGE_16GB HUGETLB_FLAG_ENCODE_16GB +struct cachestat_range { + __u64 off; + __u64 len; +}; + +struct cachestat { + __u64 nr_cache; + __u64 nr_dirty; + __u64 nr_writeback; + __u64 nr_evicted; + __u64 nr_recently_evicted; +}; + #endif /* _UAPI_LINUX_MMAN_H */ diff --git a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl index 3f1886ad9d80..cfda2511badf 100644 --- a/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl +++ b/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl @@ -365,3 +365,4 @@ 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 n64 cachestat sys_cachestat diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl index a0be127475b1..8c0b08b7a80e 100644 --- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl @@ -537,3 +537,4 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl index b68f47541169..a6935af2235c 100644 --- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl +++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl @@ -453,3 +453,4 @@ 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat sys_cachestat diff --git a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl index c84d12608cd2..227538b0ce80 100644 --- a/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl +++ b/tools/perf/arch/x86/entry/syscalls/syscall_64.tbl @@ -372,6 +372,7 @@ 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv 450 common set_mempolicy_home_node sys_set_mempolicy_home_node +451 common cachestat sys_cachestat # # Due to a historical design error, certain syscalls are numbered differently -- cgit v1.2.3 From 1feece2780ac2f8de45177fe53979726cee4b3d1 Mon Sep 17 00:00:00 2001 From: James Clark Date: Fri, 7 Jul 2023 16:45:46 +0100 Subject: perf build: Fix library not found error when using CSLIBS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit -L only specifies the search path for libraries directly provided in the link line with -l. Because -lopencsd isn't specified, it's only linked because it's a dependency of -lopencsd_c_api. Dependencies like this are resolved using the default system search paths or -rpath-link=... rather than -L. This means that compilation only works if OpenCSD is installed to the system rather than provided with the CSLIBS (-L) option. This could be fixed by adding -Wl,-rpath-link=$(CSLIBS) but that is less conventional than just adding -lopencsd to the link line so that it uses -L. -lopencsd seems to have been removed in commit ed17b1914978eddb ("perf tools: Drop requirement for libstdc++.so for libopencsd check") because it was thought that there was a chance compilation would work even if it didn't exist, but I think that only applies to libstdc++ so there is no harm to add it back. libopencsd.so and libopencsd_c_api.so would always exist together. Testing ======= The following scenarios now all work: * Cross build with OpenCSD installed * Cross build using CSLIBS=... * Native build with OpenCSD installed * Native build using CSLIBS=... * Static cross build with OpenCSD installed * Static cross build with CSLIBS=... Committer testing: ⬢[acme@toolbox perf-tools]$ alias m alias m='make -k BUILD_BPF_SKEL=1 CORESIGHT=1 O=/tmp/build/perf-tools -C tools/perf install-bin && git status && perf test python ; perf record -o /dev/null sleep 0.01 ; perf stat --null sleep 0.01' ⬢[acme@toolbox perf-tools]$ ldd ~/bin/perf | grep csd libopencsd_c_api.so.1 => /lib64/libopencsd_c_api.so.1 (0x00007fd49c44e000) libopencsd.so.1 => /lib64/libopencsd.so.1 (0x00007fd49bd56000) ⬢[acme@toolbox perf-tools]$ cat /etc/redhat-release Fedora release 36 (Thirty Six) ⬢[acme@toolbox perf-tools]$ Fixes: ed17b1914978eddb ("perf tools: Drop requirement for libstdc++.so for libopencsd check") Reported-by: Radhey Shyam Pandey Signed-off-by: James Clark Tested-by: Arnaldo Carvalho de Melo Tested-by: Radhey Shyam Pandey Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Uwe Kleine-König Cc: coresight@lists.linaro.org Closes: https://lore.kernel.org/linux-arm-kernel/56905d7a-a91e-883a-b707-9d5f686ba5f1@arm.com/ Link: https://lore.kernel.org/all/36cc4dc6-bf4b-1093-1c0a-876e368af183@kleine-koenig.org/ Link: https://lore.kernel.org/r/20230707154546.456720-1-james.clark@arm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/Makefile.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 0609c19caabd..c5db0de49868 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -155,9 +155,9 @@ FEATURE_CHECK_LDFLAGS-libcrypto = -lcrypto ifdef CSINCLUDES LIBOPENCSD_CFLAGS := -I$(CSINCLUDES) endif -OPENCSDLIBS := -lopencsd_c_api +OPENCSDLIBS := -lopencsd_c_api -lopencsd ifeq ($(findstring -static,${LDFLAGS}),-static) - OPENCSDLIBS += -lopencsd -lstdc++ + OPENCSDLIBS += -lstdc++ endif ifdef CSLIBS LIBOPENCSD_LDFLAGS := -L$(CSLIBS) -- cgit v1.2.3 From 8d40f74ebf217d3b9e9b7481721e6236b857cc55 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Thu, 6 Jul 2023 12:04:40 +0530 Subject: perf vendor events amd: Fix large metrics There are cases where a metric requires more events than the number of available counters. E.g. AMD Zen, Zen 2 and Zen 3 processors have four data fabric counters but the "nps1_die_to_dram" metric has eight events. By default, the constituent events are placed in a group and since the events cannot be scheduled at the same time, the metric is not computed. The "all metrics" test also fails because of this. Use the NO_GROUP_EVENTS constraint for such metrics which anyway expect the user to run perf with "--metric-no-group". E.g. $ sudo perf test -v 101 Before: 101: perf all metrics test : --- start --- test child forked, pid 37131 Testing branch_misprediction_ratio Testing all_remote_links_outbound Testing nps1_die_to_dram Metric 'nps1_die_to_dram' not printed in: Error: Invalid event (dram_channel_data_controller_4) in per-thread mode, enable system wide with '-a'. Testing macro_ops_dispatched Testing all_l2_cache_accesses Testing all_l2_cache_hits Testing all_l2_cache_misses Testing ic_fetch_miss_ratio Testing l2_cache_accesses_from_l2_hwpf Testing l2_cache_misses_from_l2_hwpf Testing op_cache_fetch_miss_ratio Testing l3_read_miss_latency Testing l1_itlb_misses test child finished with -1 ---- end ---- perf all metrics test: FAILED! After: 101: perf all metrics test : --- start --- test child forked, pid 43766 Testing branch_misprediction_ratio Testing all_remote_links_outbound Testing nps1_die_to_dram Testing macro_ops_dispatched Testing all_l2_cache_accesses Testing all_l2_cache_hits Testing all_l2_cache_misses Testing ic_fetch_miss_ratio Testing l2_cache_accesses_from_l2_hwpf Testing l2_cache_misses_from_l2_hwpf Testing op_cache_fetch_miss_ratio Testing l3_read_miss_latency Testing l1_itlb_misses test child finished with 0 ---- end ---- perf all metrics test: Ok Reported-by: Ayush Jain Suggested-by: Ian Rogers Signed-off-by: Sandipan Das Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Santosh Shukla Link: https://lore.kernel.org/r/20230706063440.54189-1-sandipan.das@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/pmu-events/arch/x86/amdzen1/recommended.json | 3 ++- tools/perf/pmu-events/arch/x86/amdzen2/recommended.json | 3 ++- tools/perf/pmu-events/arch/x86/amdzen3/recommended.json | 3 ++- 3 files changed, 6 insertions(+), 3 deletions(-) diff --git a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json index bf5083c1c260..4d28177325a0 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen1/recommended.json @@ -169,8 +169,9 @@ }, { "MetricName": "nps1_die_to_dram", - "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)", + "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die)", "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricGroup": "data_fabric", "PerPkg": "1", "ScaleUnit": "6.1e-5MiB" diff --git a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json index a71694a043ba..60e19456d4c8 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen2/recommended.json @@ -169,8 +169,9 @@ }, { "MetricName": "nps1_die_to_dram", - "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)", + "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die)", "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricGroup": "data_fabric", "PerPkg": "1", "ScaleUnit": "6.1e-5MiB" diff --git a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json index 988cf68ae825..3e9e1781812e 100644 --- a/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json +++ b/tools/perf/pmu-events/arch/x86/amdzen3/recommended.json @@ -205,10 +205,11 @@ }, { "MetricName": "nps1_die_to_dram", - "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die) (may need --metric-no-group)", + "BriefDescription": "Approximate: Combined DRAM B/bytes of all channels on a NPS1 node (die)", "MetricExpr": "dram_channel_data_controller_0 + dram_channel_data_controller_1 + dram_channel_data_controller_2 + dram_channel_data_controller_3 + dram_channel_data_controller_4 + dram_channel_data_controller_5 + dram_channel_data_controller_6 + dram_channel_data_controller_7", "MetricGroup": "data_fabric", "PerPkg": "1", + "MetricConstraint": "NO_GROUP_EVENTS", "ScaleUnit": "6.1e-5MiB" } ] -- cgit v1.2.3 From 48fa42c94547d80586db4eae8c1f5d0592f18ebd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 12:23:53 -0300 Subject: tools headers uapi: Sync linux/fcntl.h with the kernel sources To get the changes in: 96b2b072ee62be8a ("exportfs: allow exporting non-decodeable file handles to userspace") That don't add anything that is handled by existing hard coded tables or table generation scripts. This silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/fcntl.h include/uapi/linux/fcntl.h Cc: Adrian Hunter Cc: Amir Goldstein Cc: Ian Rogers Cc: Jan Kara Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZK11P5AwRBUxxutI@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/fcntl.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tools/include/uapi/linux/fcntl.h b/tools/include/uapi/linux/fcntl.h index e8c07da58c9f..6c80f96049bd 100644 --- a/tools/include/uapi/linux/fcntl.h +++ b/tools/include/uapi/linux/fcntl.h @@ -112,4 +112,9 @@ #define AT_RECURSIVE 0x8000 /* Apply to the entire subtree */ +/* Flags for name_to_handle_at(2). We reuse AT_ flag space to save bits... */ +#define AT_HANDLE_FID AT_REMOVEDIR /* file handle is needed to + compare object identity and may not + be usable to open_by_handle_at(2) */ + #endif /* _UAPI_LINUX_FCNTL_H */ -- cgit v1.2.3 From b19c98f237cd76981aaded52c258ce93f7daa8cb Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Jun 2023 01:05:41 -0400 Subject: btrfs: fix race between balance and cancel/pause Syzbot reported a panic that looks like this: assertion failed: fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED, in fs/btrfs/ioctl.c:465 ------------[ cut here ]------------ kernel BUG at fs/btrfs/messages.c:259! RIP: 0010:btrfs_assertfail+0x2c/0x30 fs/btrfs/messages.c:259 Call Trace: btrfs_exclop_balance fs/btrfs/ioctl.c:465 [inline] btrfs_ioctl_balance fs/btrfs/ioctl.c:3564 [inline] btrfs_ioctl+0x531e/0x5b30 fs/btrfs/ioctl.c:4632 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:870 [inline] __se_sys_ioctl fs/ioctl.c:856 [inline] __x64_sys_ioctl+0x197/0x210 fs/ioctl.c:856 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x39/0xb0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd The reproducer is running a balance and a cancel or pause in parallel. The way balance finishes is a bit wonky, if we were paused we need to save the balance_ctl in the fs_info, but clear it otherwise and cleanup. However we rely on the return values being specific errors, or having a cancel request or no pause request. If balance completes and returns 0, but we have a pause or cancel request we won't do the appropriate cleanup, and then the next time we try to start a balance we'll trip this ASSERT. The error handling is just wrong here, we always want to clean up, unless we got -ECANCELLED and we set the appropriate pause flag in the exclusive op. With this patch the reproducer ran for an hour without tripping, previously it would trip in less than a few minutes. Reported-by: syzbot+c0f3acf145cb465426d5@syzkaller.appspotmail.com CC: stable@vger.kernel.org # 6.1+ Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index b8540af6e136..30b8744a7591 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -4081,14 +4081,6 @@ static int alloc_profile_is_valid(u64 flags, int extended) return has_single_bit_set(flags); } -static inline int balance_need_close(struct btrfs_fs_info *fs_info) -{ - /* cancel requested || normal exit path */ - return atomic_read(&fs_info->balance_cancel_req) || - (atomic_read(&fs_info->balance_pause_req) == 0 && - atomic_read(&fs_info->balance_cancel_req) == 0); -} - /* * Validate target profile against allowed profiles and return true if it's OK. * Otherwise print the error message and return false. @@ -4278,6 +4270,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, u64 num_devices; unsigned seq; bool reducing_redundancy; + bool paused = false; int i; if (btrfs_fs_closing(fs_info) || @@ -4408,6 +4401,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req)) { btrfs_info(fs_info, "balance: paused"); btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED); + paused = true; } /* * Balance can be canceled by: @@ -4436,8 +4430,8 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, btrfs_update_ioctl_balance_args(fs_info, bargs); } - if ((ret && ret != -ECANCELED && ret != -ENOSPC) || - balance_need_close(fs_info)) { + /* We didn't pause, we can clean everything up. */ + if (!paused) { reset_balance_state(fs_info); btrfs_exclop_finish(fs_info); } -- cgit v1.2.3 From 4e7de35eb7d1a1d4f2dda15f39fbedd4798a0b8d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jun 2023 08:13:23 +0200 Subject: btrfs: be a bit more careful when setting mirror_num_ret in btrfs_map_block The mirror_num_ret is allowed to be NULL, although it has to be set when smap is set. Unfortunately that is not a well enough specifiable invariant for static type checkers, so add a NULL check to make sure they are fine. Fixes: 03793cbbc80f ("btrfs: add fast path for single device io in __btrfs_map_block") Reported-by: Dan Carpenter Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 30b8744a7591..efa19a528c33 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6398,7 +6398,8 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, (op == BTRFS_MAP_READ || !dev_replace_is_ongoing || !dev_replace->tgtdev)) { set_io_stripe(smap, map, stripe_index, stripe_offset, stripe_nr); - *mirror_num_ret = mirror_num; + if (mirror_num_ret) + *mirror_num_ret = mirror_num; *bioc_ret = NULL; ret = 0; goto out; -- cgit v1.2.3 From 0657b20c5a76c938612f8409735a8830d257866e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 28 Jun 2023 17:13:37 +0100 Subject: btrfs: fix use-after-free of new block group that became unused If a task creates a new block group and that block group becomes unused before we finish its creation, at btrfs_create_pending_block_groups(), then when btrfs_mark_bg_unused() is called against the block group, we assume that the block group is currently in the list of block groups to reclaim, and we move it out of the list of new block groups and into the list of unused block groups. This has two consequences: 1) We move it out of the list of new block groups associated to the current transaction. So the block group creation is not finished and if we attempt to delete the bg because it's unused, we will not find the block group item in the extent tree (or the new block group tree), its device extent items in the device tree etc, resulting in the deletion to fail due to the missing items; 2) We don't increment the reference count on the block group when we move it to the list of unused block groups, because we assumed the block group was on the list of block groups to reclaim, and in that case it already has the correct reference count. However the block group was on the list of new block groups, in which case no extra reference was taken because it's local to the current task. This later results in doing an extra reference count decrement when removing the block group from the unused list, eventually leading the reference count to 0. This second case was caught when running generic/297 from fstests, which produced the following assertion failure and stack trace: [589.559] assertion failed: refcount_read(&block_group->refs) == 1, in fs/btrfs/block-group.c:4299 [589.559] ------------[ cut here ]------------ [589.559] kernel BUG at fs/btrfs/block-group.c:4299! [589.560] invalid opcode: 0000 [#1] PREEMPT SMP PTI [589.560] CPU: 8 PID: 2819134 Comm: umount Tainted: G W 6.4.0-rc6-btrfs-next-134+ #1 [589.560] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [589.560] RIP: 0010:btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.561] Code: 68 62 da c0 (...) [589.561] RSP: 0018:ffffa55a8c3b3d98 EFLAGS: 00010246 [589.561] RAX: 0000000000000058 RBX: ffff8f030d7f2000 RCX: 0000000000000000 [589.562] RDX: 0000000000000000 RSI: ffffffff953f0878 RDI: 00000000ffffffff [589.562] RBP: ffff8f030d7f2088 R08: 0000000000000000 R09: ffffa55a8c3b3c50 [589.562] R10: 0000000000000001 R11: 0000000000000001 R12: ffff8f05850b4c00 [589.562] R13: ffff8f030d7f2090 R14: ffff8f05850b4cd8 R15: dead000000000100 [589.563] FS: 00007f497fd2e840(0000) GS:ffff8f09dfc00000(0000) knlGS:0000000000000000 [589.563] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [589.563] CR2: 00007f497ff8ec10 CR3: 0000000271472006 CR4: 0000000000370ee0 [589.563] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [589.564] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [589.564] Call Trace: [589.564] [589.565] ? __die_body+0x1b/0x60 [589.565] ? die+0x39/0x60 [589.565] ? do_trap+0xeb/0x110 [589.565] ? btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.566] ? do_error_trap+0x6a/0x90 [589.566] ? btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.566] ? exc_invalid_op+0x4e/0x70 [589.566] ? btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.567] ? asm_exc_invalid_op+0x16/0x20 [589.567] ? btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.567] ? btrfs_free_block_groups+0x449/0x4a0 [btrfs] [589.567] close_ctree+0x35d/0x560 [btrfs] [589.568] ? fsnotify_sb_delete+0x13e/0x1d0 [589.568] ? dispose_list+0x3a/0x50 [589.568] ? evict_inodes+0x151/0x1a0 [589.568] generic_shutdown_super+0x73/0x1a0 [589.569] kill_anon_super+0x14/0x30 [589.569] btrfs_kill_super+0x12/0x20 [btrfs] [589.569] deactivate_locked_super+0x2e/0x70 [589.569] cleanup_mnt+0x104/0x160 [589.570] task_work_run+0x56/0x90 [589.570] exit_to_user_mode_prepare+0x160/0x170 [589.570] syscall_exit_to_user_mode+0x22/0x50 [589.570] ? __x64_sys_umount+0x12/0x20 [589.571] do_syscall_64+0x48/0x90 [589.571] entry_SYSCALL_64_after_hwframe+0x72/0xdc [589.571] RIP: 0033:0x7f497ff0a567 [589.571] Code: af 98 0e (...) [589.572] RSP: 002b:00007ffc98347358 EFLAGS: 00000246 ORIG_RAX: 00000000000000a6 [589.572] RAX: 0000000000000000 RBX: 00007f49800b8264 RCX: 00007f497ff0a567 [589.572] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000557f558abfa0 [589.573] RBP: 0000557f558a6ba0 R08: 0000000000000000 R09: 00007ffc98346100 [589.573] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [589.573] R13: 0000557f558abfa0 R14: 0000557f558a6cb0 R15: 0000557f558a6dd0 [589.573] [589.574] Modules linked in: dm_snapshot dm_thin_pool (...) [589.576] ---[ end trace 0000000000000000 ]--- Fix this by adding a runtime flag to the block group to tell that the block group is still in the list of new block groups, and therefore it should not be moved to the list of unused block groups, at btrfs_mark_bg_unused(), until the flag is cleared, when we finish the creation of the block group at btrfs_create_pending_block_groups(). Fixes: a9f189716cf1 ("btrfs: move out now unused BG from the reclaim list") CC: stable@vger.kernel.org # 5.15+ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 13 +++++++++++-- fs/btrfs/block-group.h | 5 +++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 6753524b146c..f53297726238 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1640,13 +1640,14 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - trace_btrfs_add_unused_block_group(bg); spin_lock(&fs_info->unused_bgs_lock); if (list_empty(&bg->bg_list)) { btrfs_get_block_group(bg); + trace_btrfs_add_unused_block_group(bg); list_add_tail(&bg->bg_list, &fs_info->unused_bgs); - } else { + } else if (!test_bit(BLOCK_GROUP_FLAG_NEW, &bg->runtime_flags)) { /* Pull out the block group from the reclaim_bgs list. */ + trace_btrfs_add_unused_block_group(bg); list_move_tail(&bg->bg_list, &fs_info->unused_bgs); } spin_unlock(&fs_info->unused_bgs_lock); @@ -2668,6 +2669,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) next: btrfs_delayed_refs_rsv_release(fs_info, 1); list_del_init(&block_group->bg_list); + clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); } btrfs_trans_release_chunk_metadata(trans); } @@ -2707,6 +2709,13 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran if (!cache) return ERR_PTR(-ENOMEM); + /* + * Mark it as new before adding it to the rbtree of block groups or any + * list, so that no other task finds it and calls btrfs_mark_bg_unused() + * before the new flag is set. + */ + set_bit(BLOCK_GROUP_FLAG_NEW, &cache->runtime_flags); + cache->length = size; set_free_space_tree_thresholds(cache); cache->flags = type; diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index f204addc3fe8..381c54a56417 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -70,6 +70,11 @@ enum btrfs_block_group_flags { BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, /* Indicate that the block group is placed on a sequential zone */ BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE, + /* + * Indicate that block group is in the list of new block groups of a + * transaction. + */ + BLOCK_GROUP_FLAG_NEW, }; enum btrfs_caching_type { -- cgit v1.2.3 From 225bbf44bffd30dfde7449c9e74b3ea238397113 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 12:33:30 -0300 Subject: tools headers UAPI: Sync linux/kvm.h with the kernel sources To pick the changes in: 89d01306e34d6ace ("RISC-V: KVM: Implement device interface for AIA irqchip") 22725266bdf95bdd ("KVM: Fix comment for KVM_ENABLE_CAP") 2f440b72e852be42 ("KVM: arm64: Add KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE") That just rebuilds perf, as these patches don't add any new KVM ioctl to be harvested for the the 'perf trace' ioctl syscall argument beautifiers. This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/kvm.h include/uapi/linux/kvm.h Cc: Adrian Hunter Cc: Anup Patel Cc: Binbin Wu Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Oliver Upton Cc: Ricardo Koller Cc: Sean Christopherson Link: https://lore.kernel.org/lkml/ZK12+virXMIXMysy@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/kvm.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/include/uapi/linux/kvm.h b/tools/include/uapi/linux/kvm.h index 737318b1c1d9..f089ab290978 100644 --- a/tools/include/uapi/linux/kvm.h +++ b/tools/include/uapi/linux/kvm.h @@ -1190,6 +1190,8 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP 225 #define KVM_CAP_PMU_EVENT_MASKED_EVENTS 226 #define KVM_CAP_COUNTER_OFFSET 227 +#define KVM_CAP_ARM_EAGER_SPLIT_CHUNK_SIZE 228 +#define KVM_CAP_ARM_SUPPORTED_BLOCK_SIZES 229 #ifdef KVM_CAP_IRQ_ROUTING @@ -1442,6 +1444,8 @@ enum kvm_device_type { #define KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_XIVE KVM_DEV_TYPE_ARM_PV_TIME, #define KVM_DEV_TYPE_ARM_PV_TIME KVM_DEV_TYPE_ARM_PV_TIME + KVM_DEV_TYPE_RISCV_AIA, +#define KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_RISCV_AIA KVM_DEV_TYPE_MAX, }; @@ -1613,7 +1617,7 @@ struct kvm_s390_ucas_mapping { #define KVM_GET_DEBUGREGS _IOR(KVMIO, 0xa1, struct kvm_debugregs) #define KVM_SET_DEBUGREGS _IOW(KVMIO, 0xa2, struct kvm_debugregs) /* - * vcpu version available with KVM_ENABLE_CAP + * vcpu version available with KVM_CAP_ENABLE_CAP * vm version available with KVM_CAP_ENABLE_CAP_VM */ #define KVM_ENABLE_CAP _IOW(KVMIO, 0xa3, struct kvm_enable_cap) -- cgit v1.2.3 From 920b91d92702784f34b7c70b86ca051ba1c94430 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 12:51:37 -0300 Subject: tools include UAPI: Sync linux/mount.h copy with the kernel sources To pick the changes from: 6ac392815628f317 ("fs: allow to mount beneath top mount") That, after a fix to the move_mount_flags.sh script, harvests the new MOVE_MOUNT_BENEATH move_mount flag: $ tools/perf/trace/beauty/move_mount_flags.sh > before $ cp include/uapi/linux/mount.h tools/include/uapi/linux/mount.h $ tools/perf/trace/beauty/move_mount_flags.sh > after $ $ diff -u before after --- before 2023-07-11 12:38:49.244886707 -0300 +++ after 2023-07-11 12:51:15.125255940 -0300 @@ -6,4 +6,5 @@ [ilog2(0x00000020) + 1] = "T_AUTOMOUNTS", [ilog2(0x00000040) + 1] = "T_EMPTY_PATH", [ilog2(0x00000100) + 1] = "SET_GROUP", + [ilog2(0x00000200) + 1] = "BENEATH", }; $ That will then be properly decoded when used in tools like: # perf trace -e move_mount This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/mount.h include/uapi/linux/mount.h Cc: Christian Brauner Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZK17kifP%2FiYl+Hcc@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/mount.h | 3 ++- tools/perf/trace/beauty/move_mount_flags.sh | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/linux/mount.h b/tools/include/uapi/linux/mount.h index 4d93967f8aea..8eb0d7b758d2 100644 --- a/tools/include/uapi/linux/mount.h +++ b/tools/include/uapi/linux/mount.h @@ -74,7 +74,8 @@ #define MOVE_MOUNT_T_AUTOMOUNTS 0x00000020 /* Follow automounts on to path */ #define MOVE_MOUNT_T_EMPTY_PATH 0x00000040 /* Empty to path permitted */ #define MOVE_MOUNT_SET_GROUP 0x00000100 /* Set sharing group instead */ -#define MOVE_MOUNT__MASK 0x00000177 +#define MOVE_MOUNT_BENEATH 0x00000200 /* Mount beneath top mount */ +#define MOVE_MOUNT__MASK 0x00000377 /* * fsopen() flags. diff --git a/tools/perf/trace/beauty/move_mount_flags.sh b/tools/perf/trace/beauty/move_mount_flags.sh index 32e552faf37a..ce5e632d1448 100755 --- a/tools/perf/trace/beauty/move_mount_flags.sh +++ b/tools/perf/trace/beauty/move_mount_flags.sh @@ -10,7 +10,7 @@ fi linux_mount=${linux_header_dir}/mount.h printf "static const char *move_mount_flags[] = {\n" -regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOVE_MOUNT_([^_]+_[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' +regex='^[[:space:]]*#[[:space:]]*define[[:space:]]+MOVE_MOUNT_([^_]+[[:alnum:]_]+)[[:space:]]+(0x[[:xdigit:]]+)[[:space:]]*.*' grep -E $regex ${linux_mount} | \ sed -r "s/$regex/\2 \1/g" | \ xargs printf "\t[ilog2(%s) + 1] = \"%s\",\n" -- cgit v1.2.3 From a87834d19aa2bf455e2fd9309ef4754eccd8459a Mon Sep 17 00:00:00 2001 From: Thomas Richter Date: Tue, 11 Jul 2023 15:53:38 +0200 Subject: perf build: Fix broken feature check for libtracefs due to external lib changes The perf build process auto-detects features and packages already installed for its build. This is done in directory tools/build/feature. This directory contains small sample programs. When they successfully compile the necessary prereqs in form of libraries and header files are present. Such a check is also done for libtracefs. And this check fails: Output before: # rm -f test-libtracefs.bin; make test-libtracefs.bin gcc -MD -Wall -Werror -o test-libtracefs.bin test-libtracefs.c \ > test-libtracefs.make.output 2>&1 -ltracefs make: *** [Makefile:211: test-libtracefs.bin] Error 1 # cat test-libtracefs.make.output In file included from test-libtracefs.c:2: /usr/include/tracefs/tracefs.h:11:10: fatal error: \ event-parse.h: No such file or directory 11 | #include | ^~~~~~~~~~~~~~~ compilation terminated. # The root cause of this compile error is commit 880885d9c22e ("libtracefs: Remove "traceevent/" from referencing libtraceevent headers") in the libtracefs project hosted here: https://git.kernel.org/pub/scm/libs/libtrace/libtracefs.git/ That mentioned patch removes the traceevent/ directory name from the include statement, causing the file not to be included even when the libtraceevent-devel package is installed. This package contains the file referred to in tracefs/tracefs.h: # rpm -ql libtraceevent-devel /usr/include/traceevent /usr/include/traceevent/event-parse.h <----- here /usr/include/traceevent/event-utils.h /usr/include/traceevent/kbuffer.h /usr/include/traceevent/trace-seq.h /usr/lib64/libtraceevent.so /usr/lib64/pkgconfig/libtraceevent.pc # With this patch the compile succeeds. Output after: # rm -f test-libtracefs.bin; make test-libtracefs.bin gcc -MD -Wall -Werror -o test-libtracefs.bin test-libtracefs.c \ > test-libtracefs.make.output 2>&1 -I/usr/include/traceevent -ltracefs # Committer testing: $ make -k BUILD_BPF_SKEL=1 CORESIGHT=1 O=/tmp/build/perf-tools -C tools/perf install-bin Before: $ cat /tmp/build/perf-tools/feature/test-libtracefs.make.output In file included from test-libtracefs.c:2: /usr/include/tracefs/tracefs.h:11:10: fatal error: event-parse.h: No such file or directory 11 | #include | ^~~~~~~~~~~~~~~ compilation terminated. $ $ grep -i tracefs /tmp/build/perf-tools/FEATURE-DUMP feature-libtracefs=0 $ After: $ cat /tmp/build/perf-tools/feature/test-libtracefs.make.output $ $ grep -i tracefs /tmp/build/perf-tools/FEATURE-DUMP feature-libtracefs=1 $ Signed-off-by: Thomas Richter Tested-by: Arnaldo Carvalho de Melo Cc: Heiko Carstens Cc: Jiri Olsa Cc: Steven Rostedt (VMware) Cc: Sumanth Korikkar Cc: Sven Schnelle Cc: Vasily Gorbik Link: https://lore.kernel.org/r/20230711135338.397473-1-tmricht@linux.ibm.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/build/feature/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index 0f0aa9b7d7b5..2cd6dbbee088 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -208,7 +208,7 @@ $(OUTPUT)test-libtraceevent.bin: $(BUILD) -ltraceevent $(OUTPUT)test-libtracefs.bin: - $(BUILD) -ltracefs + $(BUILD) $(shell $(PKG_CONFIG) --cflags libtraceevent 2>/dev/null) -ltracefs $(OUTPUT)test-libcrypto.bin: $(BUILD) -lcrypto -- cgit v1.2.3 From ad07149f34dbb3e0f4e25e19ef80bdd3216ba1cf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 11 Jul 2023 13:23:02 -0300 Subject: tools headers UAPI: Sync linux/prctl.h with the kernel sources To pick the changes in: 1fd96a3e9d5d4feb ("riscv: Add prctl controls for userspace vector management") That adds some RISC-V specific prctl options: $ tools/perf/trace/beauty/prctl_option.sh > before $ cp include/uapi/linux/prctl.h tools/include/uapi/linux/prctl.h $ tools/perf/trace/beauty/prctl_option.sh > after $ diff -u before after --- before 2023-07-11 13:22:01.928705942 -0300 +++ after 2023-07-11 13:22:36.342645970 -0300 @@ -63,6 +63,8 @@ [66] = "GET_MDWE", [67] = "SET_MEMORY_MERGE", [68] = "GET_MEMORY_MERGE", + [69] = "RISCV_V_SET_CONTROL", + [70] = "RISCV_V_GET_CONTROL", }; static const char *prctl_set_mm_options[] = { [1] = "START_CODE", $ That now will be used to decode the syscall option and also to compose filters, for instance: [root@five ~]# perf trace -e syscalls:sys_enter_prctl --filter option==SET_NAME 0.000 Isolated Servi/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23f13b7aee) 0.032 DOM Worker/3474327 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23deb25670) 7.920 :3474328/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fbb10) 7.935 StreamT~s #374/3474328 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24fb970) 8.400 Isolated Servi/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24bab10) 8.418 StreamT~s #374/3474329 syscalls:sys_enter_prctl(option: SET_NAME, arg2: 0x7f23e24ba970) ^C[root@five ~]# This addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/include/uapi/linux/prctl.h include/uapi/linux/prctl.h Cc: Adrian Hunter Cc: Andy Chiu Cc: Ian Rogers Cc: Jiri Olsa Cc: Namhyung Kim Cc: Palmer Dabbelt Link: https://lore.kernel.org/lkml/ZK2DhOB6JJKu2A7M@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/prctl.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/include/uapi/linux/prctl.h b/tools/include/uapi/linux/prctl.h index f23d9a16507f..3c36aeade991 100644 --- a/tools/include/uapi/linux/prctl.h +++ b/tools/include/uapi/linux/prctl.h @@ -294,4 +294,15 @@ struct prctl_mm_map { #define PR_SET_MEMORY_MERGE 67 #define PR_GET_MEMORY_MERGE 68 + +#define PR_RISCV_V_SET_CONTROL 69 +#define PR_RISCV_V_GET_CONTROL 70 +# define PR_RISCV_V_VSTATE_CTRL_DEFAULT 0 +# define PR_RISCV_V_VSTATE_CTRL_OFF 1 +# define PR_RISCV_V_VSTATE_CTRL_ON 2 +# define PR_RISCV_V_VSTATE_CTRL_INHERIT (1 << 4) +# define PR_RISCV_V_VSTATE_CTRL_CUR_MASK 0x3 +# define PR_RISCV_V_VSTATE_CTRL_NEXT_MASK 0xc +# define PR_RISCV_V_VSTATE_CTRL_MASK 0x1f + #endif /* _LINUX_PRCTL_H */ -- cgit v1.2.3 From f4d1a8e011909fee24643f84bd1196e1366c26f2 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Fri, 7 Jul 2023 21:01:26 -0700 Subject: scsi: storvsc: Handle SRB status value 0x30 In response to a disk I/O request, Hyper-V has been observed to return SRB status value 0x30. This indicates the request was not processed by Hyper-V because low memory conditions on the host caused an internal error. The 0x30 status is not recognized by storvsc, so the I/O operation is not flagged as an error. The request is treated as if it completed normally but with zero data transferred, causing a flood of retries. Add a definition for this SRB status value and handle it like other error statuses from the Hyper-V host. Signed-off-by: Michael Kelley Link: https://lore.kernel.org/r/1688788886-94279-1-git-send-email-mikelley@microsoft.com Signed-off-by: Martin K. Petersen --- drivers/scsi/storvsc_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 659196a2f63a..7f12d931fe7c 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -318,6 +318,7 @@ enum storvsc_request_type { #define SRB_STATUS_INVALID_REQUEST 0x06 #define SRB_STATUS_DATA_OVERRUN 0x12 #define SRB_STATUS_INVALID_LUN 0x20 +#define SRB_STATUS_INTERNAL_ERROR 0x30 #define SRB_STATUS(status) \ (status & ~(SRB_STATUS_AUTOSENSE_VALID | SRB_STATUS_QUEUE_FROZEN)) @@ -978,6 +979,7 @@ static void storvsc_handle_error(struct vmscsi_request *vm_srb, case SRB_STATUS_ERROR: case SRB_STATUS_ABORTED: case SRB_STATUS_INVALID_REQUEST: + case SRB_STATUS_INTERNAL_ERROR: if (vm_srb->srb_status & SRB_STATUS_AUTOSENSE_VALID) { /* Check for capacity change */ if ((asc == 0x2a) && (ascq == 0x9)) { -- cgit v1.2.3 From 123ec246ebe323d468c5ca996700ea4739d20ddf Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:39 +0800 Subject: erofs: get rid of the remaining kmap_atomic() It's unnecessary to use kmap_atomic() compared with kmap_local_page(). In addition, kmap_atomic() is deprecated now. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-1-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2a29943fa5cc..ad53cf52d899 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -148,7 +148,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, *maptype = 0; return inpage; } - kunmap_atomic(inpage); + kunmap_local(inpage); might_sleep(); src = erofs_vm_map_ram(rq->in, ctx->inpages); if (!src) @@ -162,7 +162,7 @@ docopy: src = erofs_get_pcpubuf(ctx->inpages); if (!src) { DBG_BUGON(1); - kunmap_atomic(inpage); + kunmap_local(inpage); return ERR_PTR(-EFAULT); } @@ -173,9 +173,9 @@ docopy: min_t(unsigned int, total, PAGE_SIZE - *inputmargin); if (!inpage) - inpage = kmap_atomic(*in); + inpage = kmap_local_page(*in); memcpy(tmp, inpage + *inputmargin, page_copycnt); - kunmap_atomic(inpage); + kunmap_local(inpage); inpage = NULL; tmp += page_copycnt; total -= page_copycnt; @@ -214,7 +214,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, int ret, maptype; DBG_BUGON(*rq->in == NULL); - headpage = kmap_atomic(*rq->in); + headpage = kmap_local_page(*rq->in); /* LZ4 decompression inplace is only safe if zero_padding is enabled */ if (erofs_sb_has_zero_padding(EROFS_SB(rq->sb))) { @@ -223,7 +223,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, min_t(unsigned int, rq->inputsize, rq->sb->s_blocksize - rq->pageofs_in)); if (ret) { - kunmap_atomic(headpage); + kunmap_local(headpage); return ret; } may_inplace = !((rq->pageofs_in + rq->inputsize) & @@ -261,7 +261,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } if (maptype == 0) { - kunmap_atomic(headpage); + kunmap_local(headpage); } else if (maptype == 1) { vm_unmap_ram(src, ctx->inpages); } else if (maptype == 2) { @@ -289,7 +289,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, /* one optimized fast path only for non bigpcluster cases yet */ if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); - dst = kmap_atomic(*rq->out); + dst = kmap_local_page(*rq->out); dst_maptype = 0; goto dstmap_out; } @@ -311,7 +311,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, dstmap_out: ret = z_erofs_lz4_decompress_mem(&ctx, dst + rq->pageofs_out); if (!dst_maptype) - kunmap_atomic(dst); + kunmap_local(dst); else if (dst_maptype == 2) vm_unmap_ram(dst, ctx.outpages); return ret; -- cgit v1.2.3 From c5539762f32e97c5e16215fa1336e32095b8b0fd Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 28 Jun 2023 00:12:40 +0800 Subject: erofs: simplify z_erofs_transform_plain() Use memcpy_to_page() instead of open-coding them. In addition, add a missing flush_dcache_page() even though almost all modern architectures clear `PG_dcache_clean` flag for new file cache pages so that it doesn't change anything in practice. Signed-off-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230627161240.331-2-hsiangkao@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index ad53cf52d899..cfad1eac7fd9 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -328,7 +328,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, const unsigned int lefthalf = rq->outputsize - righthalf; const unsigned int interlaced_offset = rq->alg == Z_EROFS_COMPRESSION_SHIFTED ? 0 : rq->pageofs_out; - unsigned char *src, *dst; + u8 *src; if (outpages > 2 && rq->alg == Z_EROFS_COMPRESSION_SHIFTED) { DBG_BUGON(1); @@ -341,22 +341,19 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, } src = kmap_local_page(rq->in[inpages - 1]) + rq->pageofs_in; - if (rq->out[0]) { - dst = kmap_local_page(rq->out[0]); - memcpy(dst + rq->pageofs_out, src + interlaced_offset, - righthalf); - kunmap_local(dst); - } + if (rq->out[0]) + memcpy_to_page(rq->out[0], rq->pageofs_out, + src + interlaced_offset, righthalf); if (outpages > inpages) { DBG_BUGON(!rq->out[outpages - 1]); if (rq->out[outpages - 1] != rq->in[inpages - 1]) { - dst = kmap_local_page(rq->out[outpages - 1]); - memcpy(dst, interlaced_offset ? src : - (src + righthalf), lefthalf); - kunmap_local(dst); + memcpy_to_page(rq->out[outpages - 1], 0, src + + (interlaced_offset ? 0 : righthalf), + lefthalf); } else if (!interlaced_offset) { memmove(src, src + righthalf, lefthalf); + flush_dcache_page(rq->in[inpages - 1]); } } kunmap_local(src); -- cgit v1.2.3 From 936aa701d82d397c2d1afcd18ce2c739471d978d Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Mon, 10 Jul 2023 12:25:31 +0800 Subject: erofs: avoid useless loops in z_erofs_pcluster_readmore() when reading beyond EOF z_erofs_pcluster_readmore() may take a long time to loop when the page offset is large enough, which is unnecessary should be prevented. For example, when the following case is encountered, it will loop 4691368 times, taking about 27 seconds: - offset = 19217289215 - inode_size = 1442672 Signed-off-by: Chunhai Guo Fixes: 386292919c25 ("erofs: introduce readmore decompression strategy") Reviewed-by: Gao Xiang Reviewed-by: Yue Hu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230710042531.28761-1-guochunhai@vivo.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5f1890e309c6..d9a0763f4595 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1841,7 +1841,7 @@ static void z_erofs_pcluster_readmore(struct z_erofs_decompress_frontend *f, } cur = map->m_la + map->m_llen - 1; - while (cur >= end) { + while ((cur >= end) && (cur < i_size_read(inode))) { pgoff_t index = cur >> PAGE_SHIFT; struct page *page; -- cgit v1.2.3 From 8191213a5835b0317c5e4d0d337ae1ae00c75253 Mon Sep 17 00:00:00 2001 From: Chunhai Guo Date: Mon, 10 Jul 2023 17:34:10 +0800 Subject: erofs: avoid infinite loop in z_erofs_do_read_page() when reading beyond EOF z_erofs_do_read_page() may loop infinitely due to the inappropriate truncation in the below statement. Since the offset is 64 bits and min_t() truncates the result to 32 bits. The solution is to replace unsigned int with a 64-bit type, such as erofs_off_t. cur = end - min_t(unsigned int, offset + end - map->m_la, end); - For example: - offset = 0x400160000 - end = 0x370 - map->m_la = 0x160370 - offset + end - map->m_la = 0x400000000 - offset + end - map->m_la = 0x00000000 (truncated as unsigned int) - Expected result: - cur = 0 - Actual result: - cur = 0x370 Signed-off-by: Chunhai Guo Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230710093410.44071-1-guochunhai@vivo.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d9a0763f4595..b69d89a11dd0 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1035,7 +1035,7 @@ hitted: */ tight &= (fe->mode > Z_EROFS_PCLUSTER_FOLLOWED_NOINPLACE); - cur = end - min_t(unsigned int, offset + end - map->m_la, end); + cur = end - min_t(erofs_off_t, offset + end - map->m_la, end); if (!(map->m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, cur, end); goto next_part; -- cgit v1.2.3 From 18bddc5b67038722cb88fcf51fbf41a0277092cb Mon Sep 17 00:00:00 2001 From: Xin Yin Date: Tue, 11 Jul 2023 14:21:30 +0800 Subject: erofs: fix fsdax unavailability for chunk-based regular files DAX can be used to share page cache between VMs, reducing guest memory overhead. And chunk based data format is widely used for VM and container image. So enable dax support for it, make erofs better used for VM scenarios. Fixes: c5aa903a59db ("erofs: support reading chunk-based uncompressed files") Signed-off-by: Xin Yin Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20230711062130.7860-1-yinxin.x@bytedance.com Signed-off-by: Gao Xiang --- fs/erofs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d70b12b81507..e12592727a54 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -183,7 +183,8 @@ static void *erofs_read_inode(struct erofs_buf *buf, inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && - vi->datalayout == EROFS_INODE_FLAT_PLAIN) + (vi->datalayout == EROFS_INODE_FLAT_PLAIN || + vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; if (!nblks) -- cgit v1.2.3 From d6e724d3ef0b37aa425267921100c89e378eb4a9 Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Mon, 10 Jul 2023 12:33:30 -0700 Subject: Documentation: RISC-V: hwprobe: Fix a formatting error I'm not sure what I was trying to do with the ':'s, but they're just rendered to HTML which looks odd. This makes "fence.i" look like "mvendorid" and such, which is seems reasonable to me. Reviewed-by: Evan Green Link: https://lore.kernel.org/r/20230710193329.2742-1-palmer@rivosinc.com Signed-off-by: Palmer Dabbelt --- Documentation/riscv/hwprobe.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/riscv/hwprobe.rst b/Documentation/riscv/hwprobe.rst index 19165ebd82ba..933c715065d6 100644 --- a/Documentation/riscv/hwprobe.rst +++ b/Documentation/riscv/hwprobe.rst @@ -49,7 +49,7 @@ The following keys are defined: privileged ISA, with the following known exceptions (more exceptions may be added, but only if it can be demonstrated that the user ABI is not broken): - * The :fence.i: instruction cannot be directly executed by userspace + * The ``fence.i`` instruction cannot be directly executed by userspace programs (it may still be executed in userspace via a kernel-controlled mechanism such as the vDSO). -- cgit v1.2.3 From c9e4bf607d8c431452ef362c00e62ce999bbae93 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 11 Jul 2023 13:48:12 +0200 Subject: PM: hibernate: Fix writing maj:min to /sys/power/resume resume_store() first calls lookup_bdev() and after tries to handle maj:min, but it does not reset the error before, hence if you will write maj:min you will get ENOENT: # echo 259:2 >| /sys/power/resume bash: echo: write error: No such file or directory This also should fix hiberation via systemd, since it uses this way. Fixes: 1e8c813b083c4 ("PM: hibernate: don't use early_lookup_bdev in resume_store") Signed-off-by: Azat Khuzhin Reviewed-by: Christoph Hellwig [ rjw: Subject edits ] Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index f62e89d0d906..e1b4bfa938dd 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -1179,6 +1179,7 @@ static ssize_t resume_store(struct kobject *kobj, struct kobj_attribute *attr, unsigned maj, min, offset; char *p, dummy; + error = 0; if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { -- cgit v1.2.3 From 3a8395b565b5b4f019b3dc182be4c4541eb35ac8 Mon Sep 17 00:00:00 2001 From: Chungkai Yang Date: Wed, 5 Jul 2023 16:59:07 +0800 Subject: PM: QoS: Restore support for default value on frequency QoS Commit 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is non-negative") makes sure CPU freq is non-negative to avoid negative value converting to unsigned data type. However, when the value is PM_QOS_DEFAULT_VALUE, pm_qos_update_target specifically uses c->default_value which is set to FREQ_QOS_MIN/MAX_DEFAULT_VALUE when cpufreq_policy_alloc is executed, for this case handling. Adding check for PM_QOS_DEFAULT_VALUE to let default setting work will fix this problem. Fixes: 8d36694245f2 ("PM: QoS: Add check to make sure CPU freq is non-negative") Link: https://lore.kernel.org/lkml/20230626035144.19717-1-Chung-kai.Yang@mediatek.com/ Link: https://lore.kernel.org/lkml/20230627071727.16646-1-Chung-kai.Yang@mediatek.com/ Link: https://lore.kernel.org/lkml/CAJZ5v0gxNOWhC58PHeUhW_tgf6d1fGJVZ1x91zkDdht11yUv-A@mail.gmail.com/ Signed-off-by: Chungkai Yang Cc: 6.0+ # 6.0+ Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/power/qos.c b/kernel/power/qos.c index af51ed6d45ef..782d3b41c1f3 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -426,6 +426,11 @@ late_initcall(cpu_latency_qos_init); /* Definitions related to the frequency QoS below. */ +static inline bool freq_qos_value_invalid(s32 value) +{ + return value < 0 && value != PM_QOS_DEFAULT_VALUE; +} + /** * freq_constraints_init - Initialize frequency QoS constraints. * @qos: Frequency QoS constraints to initialize. @@ -531,7 +536,7 @@ int freq_qos_add_request(struct freq_constraints *qos, { int ret; - if (IS_ERR_OR_NULL(qos) || !req || value < 0) + if (IS_ERR_OR_NULL(qos) || !req || freq_qos_value_invalid(value)) return -EINVAL; if (WARN(freq_qos_request_active(req), @@ -563,7 +568,7 @@ EXPORT_SYMBOL_GPL(freq_qos_add_request); */ int freq_qos_update_request(struct freq_qos_request *req, s32 new_value) { - if (!req || new_value < 0) + if (!req || freq_qos_value_invalid(new_value)) return -EINVAL; if (WARN(!freq_qos_request_active(req), -- cgit v1.2.3 From 4b96679170c63be361d1b0fdeb81bb0ef207dbcb Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Fri, 7 Jul 2023 16:09:26 -0700 Subject: libsubcmd: Avoid SEGV/use-after-free when commands aren't excluded The array shortening may perform unnecessary array copies. Before commit 657a3efee43a ("lib subcmd: Avoid memory leak in exclude_cmds") this was benign, but afterwards this could lead to a SEGV. Fixes: 657a3efee43a29d1 ("lib subcmd: Avoid memory leak in exclude_cmds") Signed-off-by: Ian Rogers Acked-by: Namhyung Kim Cc: Chenyuan Mi Cc: Ian Rogers Link: https://lore.kernel.org/r/20230707230926.841086-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/lib/subcmd/help.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tools/lib/subcmd/help.c b/tools/lib/subcmd/help.c index 67a8d6b740ea..adfbae27dc36 100644 --- a/tools/lib/subcmd/help.c +++ b/tools/lib/subcmd/help.c @@ -68,8 +68,13 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) while (ci < cmds->cnt && ei < excludes->cnt) { cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); if (cmp < 0) { - zfree(&cmds->names[cj]); - cmds->names[cj++] = cmds->names[ci++]; + if (ci == cj) { + ci++; + cj++; + } else { + zfree(&cmds->names[cj]); + cmds->names[cj++] = cmds->names[ci++]; + } } else if (cmp == 0) { ci++; ei++; @@ -77,10 +82,11 @@ void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) ei++; } } - - while (ci < cmds->cnt) { - zfree(&cmds->names[cj]); - cmds->names[cj++] = cmds->names[ci++]; + if (ci != cj) { + while (ci < cmds->cnt) { + zfree(&cmds->names[cj]); + cmds->names[cj++] = cmds->names[ci++]; + } } for (ci = cj; ci < cmds->cnt; ci++) zfree(&cmds->names[ci]); -- cgit v1.2.3 From 5fc52248559802f0f1ac10b3c8729a1568216715 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 11 Jul 2023 14:50:54 +0200 Subject: vmlinux.lds.h: Remove a reference to no longer used sections .text..refcount Sections .text..refcount were previously used to hold an error path code for fast refcount overflow protection on x86, see commit 7a46ec0e2f48 ("locking/refcounts, x86/asm: Implement fast refcount overflow protection") and commit 564c9cc84e2a ("locking/refcounts, x86/asm: Use unique .text section for refcount exceptions"). The code was replaced and removed in commit fb041bb7c0a9 ("locking/refcount: Consolidate implementations of refcount_t") and no sections .text..refcount are present since then. Remove then a relic referencing these sections from TEXT_TEXT to avoid confusing people, like me. This is a non-functional change. Signed-off-by: Petr Pavlu Link: https://lore.kernel.org/r/20230711125054.9000-1-petr.pavlu@suse.com Signed-off-by: Kees Cook --- include/asm-generic/vmlinux.lds.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 0587354ba678..9c59409104f6 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -578,7 +578,6 @@ *(.text.unlikely .text.unlikely.*) \ *(.text.unknown .text.unknown.*) \ NOINSTR_TEXT \ - *(.text..refcount) \ *(.ref.text) \ *(.text.asan.* .text.tsan.*) \ MEM_KEEP(init.text*) \ -- cgit v1.2.3 From bfb5ef2219b7b28a6e328860438eb55027807289 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 12 Jul 2023 09:28:43 +0530 Subject: cpufreq: sparc: Don't mark cpufreq callbacks with __init These callbacks can be called again by the cpufreq core after the driver is initialized and must be kept around. We currently get section mismatch build warnings. Don't mark them with __init. Fixes: dcfce7c2cee4 ("cpufreq: sparc: Don't allocate cpufreq_driver dynamically") Reported-by: Geert Uytterhoeven Signed-off-by: Viresh Kumar Reviewed-by: Randy Dunlap Tested-by: Randy Dunlap # build-tested Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/sparc-us2e-cpufreq.c | 2 +- drivers/cpufreq/sparc-us3-cpufreq.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/sparc-us2e-cpufreq.c b/drivers/cpufreq/sparc-us2e-cpufreq.c index d3510cfdb3eb..2783d3d55fce 100644 --- a/drivers/cpufreq/sparc-us2e-cpufreq.c +++ b/drivers/cpufreq/sparc-us2e-cpufreq.c @@ -269,7 +269,7 @@ static int us2e_freq_target(struct cpufreq_policy *policy, unsigned int index) return smp_call_function_single(cpu, __us2e_freq_target, &index, 1); } -static int __init us2e_freq_cpu_init(struct cpufreq_policy *policy) +static int us2e_freq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; unsigned long clock_tick = sparc64_get_clock_tick(cpu) / 1000; diff --git a/drivers/cpufreq/sparc-us3-cpufreq.c b/drivers/cpufreq/sparc-us3-cpufreq.c index 91d1ed558136..6c3657679a88 100644 --- a/drivers/cpufreq/sparc-us3-cpufreq.c +++ b/drivers/cpufreq/sparc-us3-cpufreq.c @@ -117,7 +117,7 @@ static int us3_freq_target(struct cpufreq_policy *policy, unsigned int index) return smp_call_function_single(cpu, update_safari_cfg, &new_bits, 1); } -static int __init us3_freq_cpu_init(struct cpufreq_policy *policy) +static int us3_freq_cpu_init(struct cpufreq_policy *policy) { unsigned int cpu = policy->cpu; unsigned long clock_tick = sparc64_get_clock_tick(cpu) / 1000; -- cgit v1.2.3 From 963b54df82b6d6206d7def273390bf3f7af558e1 Mon Sep 17 00:00:00 2001 From: "Isaac J. Manjarres" Date: Tue, 11 Jul 2023 12:30:58 -0700 Subject: regmap-irq: Fix out-of-bounds access when allocating config buffers When allocating the 2D array for handling IRQ type registers in regmap_add_irq_chip_fwnode(), the intent is to allocate a matrix with num_config_bases rows and num_config_regs columns. This is currently handled by allocating a buffer to hold a pointer for each row (i.e. num_config_bases). After that, the logic attempts to allocate the memory required to hold the register configuration for each row. However, instead of doing this allocation for each row (i.e. num_config_bases allocations), the logic erroneously does this allocation num_config_regs number of times. This scenario can lead to out-of-bounds accesses when num_config_regs is greater than num_config_bases. Fix this by updating the terminating condition of the loop that allocates the memory for holding the register configuration to allocate memory only for each row in the matrix. Amit Pundir reported a crash that was occurring on his db845c device due to memory corruption (see "Closes" tag for Amit's report). The KASAN report below helped narrow it down to this issue: [ 14.033877][ T1] ================================================================== [ 14.042507][ T1] BUG: KASAN: invalid-access in regmap_add_irq_chip_fwnode+0x594/0x1364 [ 14.050796][ T1] Write of size 8 at addr 06ffff8081021850 by task init/1 [ 14.242004][ T1] The buggy address belongs to the object at ffffff8081021850 [ 14.242004][ T1] which belongs to the cache kmalloc-8 of size 8 [ 14.255669][ T1] The buggy address is located 0 bytes inside of [ 14.255669][ T1] 8-byte region [ffffff8081021850, ffffff8081021858) Fixes: faa87ce9196d ("regmap-irq: Introduce config registers for irq types") Reported-by: Amit Pundir Closes: https://lore.kernel.org/all/CAMi1Hd04mu6JojT3y6wyN2YeVkPR5R3qnkKJ8iR8if_YByCn4w@mail.gmail.com/ Tested-by: John Stultz Tested-by: Amit Pundir # tested on Dragonboard 845c Cc: stable@vger.kernel.org # v6.0+ Cc: Aidan MacDonald Cc: Saravana Kannan Cc: Catalin Marinas Signed-off-by: "Isaac J. Manjarres" Link: https://lore.kernel.org/r/20230711193059.2480971-1-isaacmanjarres@google.com Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index ced0dcf86e0b..45fd13ef13fc 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -717,7 +717,7 @@ int regmap_add_irq_chip_fwnode(struct fwnode_handle *fwnode, if (!d->config_buf) goto err_alloc; - for (i = 0; i < chip->num_config_regs; i++) { + for (i = 0; i < chip->num_config_bases; i++) { d->config_buf[i] = kcalloc(chip->num_config_regs, sizeof(**d->config_buf), GFP_KERNEL); -- cgit v1.2.3 From 9ec3c5517e22a12d2ff1b71e844f7913641460c6 Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Tue, 11 Jul 2023 17:20:20 +0900 Subject: spi: s3c64xx: clear loopback bit after loopback test When SPI loopback transfer is performed, S3C64XX_SPI_MODE_SELF_LOOPBACK bit still remained. It works as loopback even if the next transfer is not spi loopback mode. If not SPI_LOOP, needs to clear S3C64XX_SPI_MODE_SELF_LOOPBACK bit. Signed-off-by: Jaewon Kim Fixes: ffb7bcd3b27e ("spi: s3c64xx: support loopback mode") Reviewed-by: Chanho Park Link: https://lore.kernel.org/r/20230711082020.138165-1-jaewon02.kim@samsung.com Signed-off-by: Mark Brown --- drivers/spi/spi-s3c64xx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-s3c64xx.c b/drivers/spi/spi-s3c64xx.c index fd55697144cc..b6c2659a66ca 100644 --- a/drivers/spi/spi-s3c64xx.c +++ b/drivers/spi/spi-s3c64xx.c @@ -684,6 +684,8 @@ static int s3c64xx_spi_config(struct s3c64xx_spi_driver_data *sdd) if ((sdd->cur_mode & SPI_LOOP) && sdd->port_conf->has_loopback) val |= S3C64XX_SPI_MODE_SELF_LOOPBACK; + else + val &= ~S3C64XX_SPI_MODE_SELF_LOOPBACK; writel(val, regs + S3C64XX_SPI_MODE_CFG); -- cgit v1.2.3 From 2bbc72ffc4de803f6265119963aa7aac6559960f Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sat, 8 Jul 2023 21:53:08 +0200 Subject: MAINTAINERS: add myself for spi-bcm63xx I noticed the driver is unclaimed. Since I was the last one doing substantial work on it, add me as the maintainer. As it is only found in legacy products, mark it as "Odd Fixes" instead of "Maintained". Signed-off-by: Jonas Gorski Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20230708195309.72767-1-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- MAINTAINERS | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 5afc1b61daa4..9881f1163805 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4169,6 +4169,13 @@ F: Documentation/devicetree/bindings/spi/brcm,bcm63xx-hsspi.yaml F: drivers/spi/spi-bcm63xx-hsspi.c F: drivers/spi/spi-bcmbca-hsspi.c +BROADCOM BCM6348/BCM6358 SPI controller DRIVER +M: Jonas Gorski +L: linux-spi@vger.kernel.org +S: Odd Fixes +F: Documentation/devicetree/bindings/spi/spi-bcm63xx.txt +F: drivers/spi/spi-bcm63xx.c + BROADCOM ETHERNET PHY DRIVERS M: Florian Fainelli R: Broadcom internal kernel review list -- cgit v1.2.3 From 54ccc8758ef4d29de9e8fdb711c852abbdd4103a Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Sat, 8 Jul 2023 21:53:09 +0200 Subject: mailmap: add entry for Jonas Gorski The openwrt.org email address is long defunct, but still pop ups from time to time when asking get_maintainer.pl. So add an entry to my currently used address. Signed-off-by: Jonas Gorski Link: https://lore.kernel.org/r/20230708195309.72767-2-jonas.gorski@gmail.com Signed-off-by: Mark Brown --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index bf076bbc36b1..e1f8c928a5c2 100644 --- a/.mailmap +++ b/.mailmap @@ -237,6 +237,7 @@ John Paul Adrian Glaubitz John Stultz +Jonas Gorski Jordan Crouse -- cgit v1.2.3 From 66843b14fb71825fdd73ab12f6594f2243b402be Mon Sep 17 00:00:00 2001 From: Eric Lin Date: Mon, 10 Jul 2023 15:43:28 +0000 Subject: perf: RISC-V: Remove PERF_HES_STOPPED flag checking in riscv_pmu_start() Since commit 096b52fd2bb4 ("perf: RISC-V: throttle perf events") the perf_sample_event_took() function was added to report time spent in overflow interrupts. If the interrupt takes too long, the perf framework will lower the sysctl_perf_event_sample_rate and max_samples_per_tick. When hwc->interrupts is larger than max_samples_per_tick, the hwc->interrupts will be set to MAX_INTERRUPTS, and events will be throttled within the __perf_event_account_interrupt() function. However, the RISC-V PMU driver doesn't call riscv_pmu_stop() to update the PERF_HES_STOPPED flag after perf_event_overflow() in pmu_sbi_ovf_handler() function to avoid throttling. When the perf framework unthrottled the event in the timer interrupt handler, it triggers riscv_pmu_start() function and causes a WARN_ON_ONCE() warning, as shown below: ------------[ cut here ]------------ WARNING: CPU: 0 PID: 240 at drivers/perf/riscv_pmu.c:184 riscv_pmu_start+0x7c/0x8e Modules linked in: CPU: 0 PID: 240 Comm: ls Not tainted 6.4-rc4-g19d0788e9ef2 #1 Hardware name: SiFive (DT) epc : riscv_pmu_start+0x7c/0x8e ra : riscv_pmu_start+0x28/0x8e epc : ffffffff80aef864 ra : ffffffff80aef810 sp : ffff8f80004db6f0 gp : ffffffff81c83750 tp : ffffaf80069f9bc0 t0 : ffff8f80004db6c0 t1 : 0000000000000000 t2 : 000000000000001f s0 : ffff8f80004db720 s1 : ffffaf8008ca1068 a0 : 0000ffffffffffff a1 : 0000000000000000 a2 : 0000000000000001 a3 : 0000000000000870 a4 : 0000000000000000 a5 : 0000000000000000 a6 : 0000000000000840 a7 : 0000000000000030 s2 : 0000000000000000 s3 : ffffaf8005165800 s4 : ffffaf800424da00 s5 : ffffffffffffffff s6 : ffffffff81cc7590 s7 : 0000000000000000 s8 : 0000000000000006 s9 : 0000000000000001 s10: ffffaf807efbc340 s11: ffffaf807efbbf00 t3 : ffffaf8006a16028 t4 : 00000000dbfbb796 t5 : 0000000700000000 t6 : ffffaf8005269870 status: 0000000200000100 badaddr: 0000000000000000 cause: 0000000000000003 [] riscv_pmu_start+0x7c/0x8e [] perf_adjust_freq_unthr_context+0x15e/0x174 [] perf_event_task_tick+0x88/0x9c [] scheduler_tick+0xfe/0x27c [] update_process_times+0x9a/0xba [] tick_sched_handle+0x32/0x66 [] tick_sched_timer+0x64/0xb0 [] __hrtimer_run_queues+0x156/0x2f4 [] hrtimer_interrupt+0xe2/0x1fe [] riscv_timer_interrupt+0x38/0x42 [] handle_percpu_devid_irq+0x90/0x1d2 [] generic_handle_domain_irq+0x28/0x36 After referring other PMU drivers like Arm, Loongarch, Csky, and Mips, they don't call *_pmu_stop() to update with PERF_HES_STOPPED flag after perf_event_overflow() function nor do they add PERF_HES_STOPPED flag checking in *_pmu_start() which don't cause this warning. Thus, it's recommended to remove this unnecessary check in riscv_pmu_start() function to prevent this warning. Signed-off-by: Eric Lin Link: https://lore.kernel.org/r/20230710154328.19574-1-eric.lin@sifive.com Fixes: 096b52fd2bb4 ("perf: RISC-V: throttle perf events") Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- drivers/perf/riscv_pmu.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/perf/riscv_pmu.c b/drivers/perf/riscv_pmu.c index ebca5eab9c9b..56897d4d4fd3 100644 --- a/drivers/perf/riscv_pmu.c +++ b/drivers/perf/riscv_pmu.c @@ -181,9 +181,6 @@ void riscv_pmu_start(struct perf_event *event, int flags) uint64_t max_period = riscv_pmu_ctr_get_width_mask(event); u64 init_val; - if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) - return; - if (flags & PERF_EF_RELOAD) WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); -- cgit v1.2.3 From b690e266dae2f85f4dfea21fa6a05e3500a51054 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Mon, 10 Jul 2023 01:10:36 +0800 Subject: riscv: mm: fix truncation warning on RV32 lkp reports below sparse warning when building for RV32: arch/riscv/mm/init.c:1204:48: sparse: warning: cast truncates bits from constant value (100000000 becomes 0) IMO, the reason we didn't see this truncates bug in real world is "0" means MEMBLOCK_ALLOC_ACCESSIBLE in memblock and there's no RV32 HW with more than 4GB memory. Fix it anyway to make sparse happy. Fixes: decf89f86ecd ("riscv: try to allocate crashkern region from 32bit addressible memory") Signed-off-by: Jisheng Zhang Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202306080034.SLiCiOMn-lkp@intel.com/ Link: https://lore.kernel.org/r/20230709171036.1906-1-jszhang@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index 70fb31960b63..9ce504737d18 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -1346,7 +1346,7 @@ static void __init reserve_crashkernel(void) */ crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, search_start, - min(search_end, (unsigned long) SZ_4G)); + min(search_end, (unsigned long)(SZ_4G - 1))); if (crash_base == 0) { /* Try again without restricting region to 32bit addressible memory */ crash_base = memblock_phys_alloc_range(crash_size, PMD_SIZE, -- cgit v1.2.3 From d94303699921bda8141ad33554ae55b615ddd149 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Wed, 28 Jun 2023 23:22:46 +0200 Subject: drm/nouveau/disp: fix HDMI on gt215+ Cc: Ben Skeggs Cc: Lyude Paul Fixes: f530bc60a30b ("drm/nouveau/disp: move HDMI config into acquire + infoframe methods") Signed-off-by: Karol Herbst Reviewed-by: Ben Skeggs Link: https://patchwork.freedesktop.org/patch/msgid/20230628212248.3798605-1-kherbst@redhat.com Signed-off-by: Karol Herbst --- drivers/gpu/drm/nouveau/nvkm/engine/disp/gt215.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gt215.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gt215.c index a2c7c6f83dcd..506ffbe7b842 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/gt215.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/gt215.c @@ -125,7 +125,7 @@ gt215_sor_hdmi_infoframe_avi(struct nvkm_ior *ior, int head, void *data, u32 siz pack_hdmi_infoframe(&avi, data, size); nvkm_mask(device, 0x61c520 + soff, 0x00000001, 0x00000000); - if (size) + if (!size) return; nvkm_wr32(device, 0x61c528 + soff, avi.header); -- cgit v1.2.3 From c177872cb056e0b499af4717d8d1977017fd53df Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Fri, 30 Jun 2023 18:06:45 +0200 Subject: drm/nouveau/disp/g94: enable HDMI Cc: Ben Skeggs Cc: Lyude Paul Fixes: f530bc60a30b ("drm/nouveau/disp: move HDMI config into acquire + infoframe methods") Signed-off-by: Karol Herbst Reviewed-by: Ben Skeggs Link: https://patchwork.freedesktop.org/patch/msgid/20230630160645.3984596-1-kherbst@redhat.com Signed-off-by: Karol Herbst --- drivers/gpu/drm/nouveau/nvkm/engine/disp/g94.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/engine/disp/g94.c b/drivers/gpu/drm/nouveau/nvkm/engine/disp/g94.c index a4853c4e5ee3..67ef889a0c5f 100644 --- a/drivers/gpu/drm/nouveau/nvkm/engine/disp/g94.c +++ b/drivers/gpu/drm/nouveau/nvkm/engine/disp/g94.c @@ -295,6 +295,7 @@ g94_sor = { .clock = nv50_sor_clock, .war_2 = g94_sor_war_2, .war_3 = g94_sor_war_3, + .hdmi = &g84_sor_hdmi, .dp = &g94_sor_dp, }; -- cgit v1.2.3 From b718ae835bd5de891ff6fefa290940b7613d2b07 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 12 Jul 2023 07:54:59 -0700 Subject: nvme: warn only once for legacy uuid attribute Report the legacy fallback behavior for uuid attributes just once instead of logging repeated warnings for the same condition every time the attribute is read. The old behavior is too spamy on the kernel logs. Cc: Johannes Thumshirn Reported-by: Breno Leitao Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/sysfs.c b/drivers/nvme/host/sysfs.c index 45e91811f905..212e1b05d298 100644 --- a/drivers/nvme/host/sysfs.c +++ b/drivers/nvme/host/sysfs.c @@ -92,7 +92,7 @@ static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, * we have no UUID set */ if (uuid_is_null(&ids->uuid)) { - dev_warn_ratelimited(dev, + dev_warn_once(dev, "No UUID available providing old NGUID\n"); return sysfs_emit(buf, "%pU\n", ids->nguid); } -- cgit v1.2.3 From 733ba346d941d37e0814bac37b6a5c188ac3c9b2 Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Wed, 12 Jul 2023 19:18:37 +0800 Subject: nvme: fix parameter check in nvme_fault_inject_init() Make IS_ERR() judge the debugfs_create_dir() function return. Signed-off-by: Minjie Du Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/fault_inject.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/fault_inject.c b/drivers/nvme/host/fault_inject.c index 83d2e6860d38..1ba10a5c656d 100644 --- a/drivers/nvme/host/fault_inject.c +++ b/drivers/nvme/host/fault_inject.c @@ -27,7 +27,7 @@ void nvme_fault_inject_init(struct nvme_fault_inject *fault_inj, /* create debugfs directory and attribute */ parent = debugfs_create_dir(dev_name, NULL); - if (!parent) { + if (IS_ERR(parent)) { pr_warn("%s: failed to create debugfs directory\n", dev_name); return; } -- cgit v1.2.3 From d934e537c14bfe1227ced6341472571f354383e8 Mon Sep 17 00:00:00 2001 From: Yang Wang Date: Tue, 20 Jun 2023 17:05:25 +0800 Subject: drm/amd/pm: fix smu i2c data read risk the smu driver_table is used for all types of smu tables data transcation (e.g: PPtable, Metrics, i2c, Ecc..). it is necessary to hold this lock to avoiding data tampering during the i2c read operation. Signed-off-by: Yang Wang Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 2 +- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c index 9cd005131f56..3bb18396d2f9 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c @@ -2113,7 +2113,6 @@ static int arcturus_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -2130,6 +2129,7 @@ static int arcturus_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c index c94d825a871b..95f6d821bacb 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/navi10_ppt.c @@ -3021,7 +3021,6 @@ static int navi10_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -3038,6 +3037,7 @@ static int navi10_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index f7ed3e655e39..8fe2e1716da4 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -3842,7 +3842,6 @@ static int sienna_cichlid_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -3859,6 +3858,7 @@ static int sienna_cichlid_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c index e80f122d8aec..ce50ef46e73f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/aldebaran_ppt.c @@ -1525,7 +1525,6 @@ static int aldebaran_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -1542,6 +1541,7 @@ static int aldebaran_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 124287cbbff8..1d995f53aaab 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -2320,7 +2320,6 @@ static int smu_v13_0_0_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_cmn_update_table(smu, SMU_TABLE_I2C_COMMANDS, 0, req, true); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -2337,6 +2336,7 @@ static int smu_v13_0_0_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c index 6ef12252beb5..1ac552142763 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_6_ppt.c @@ -1763,7 +1763,6 @@ static int smu_v13_0_6_i2c_xfer(struct i2c_adapter *i2c_adap, } mutex_lock(&adev->pm.mutex); r = smu_v13_0_6_request_i2c_xfer(smu, req); - mutex_unlock(&adev->pm.mutex); if (r) goto fail; @@ -1780,6 +1779,7 @@ static int smu_v13_0_6_i2c_xfer(struct i2c_adapter *i2c_adap, } r = num_msgs; fail: + mutex_unlock(&adev->pm.mutex); kfree(req); return r; } -- cgit v1.2.3 From 8a774fe912ff09e39c2d3a3589c729330113f388 Mon Sep 17 00:00:00 2001 From: gaba Date: Thu, 2 Mar 2023 19:03:56 -0500 Subject: drm/amdgpu: avoid restore process run into dead loop. In restore process worker, pinned BO cause update PTE fail, then the function re-schedule the restore_work. This will generate dead loop. Signed-off-by: gaba Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c index f61527b800e6..a7f314ddd173 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c @@ -2881,6 +2881,9 @@ int amdgpu_amdkfd_gpuvm_restore_process_bos(void *info, struct dma_fence **ef) if (!attachment->is_mapped) continue; + if (attachment->bo_va->base.bo->tbo.pin_count) + continue; + kfd_mem_dmaunmap_attachment(mem, attachment); ret = update_gpuvm_pte(mem, attachment, &sync_obj); if (ret) { -- cgit v1.2.3 From dcb489bae65d92cfd26da22c7a0d6665b06ecc63 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Fri, 7 Jul 2023 14:31:34 -0500 Subject: drm/amd/pm: share the code around SMU13 pcie parameters update So that SMU13.0.0 and SMU13.0.7 do not need to have one copy each. Signed-off-by: Evan Quan Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h | 4 +++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 31 ++++++++++++++++++++ .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c | 33 +--------------------- .../gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c | 33 +--------------------- 4 files changed, 37 insertions(+), 64 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h index 6a0ac0bbaace..355c156d871a 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h +++ b/drivers/gpu/drm/amd/pm/swsmu/inc/smu_v13_0.h @@ -295,5 +295,9 @@ int smu_v13_0_get_pptable_from_firmware(struct smu_context *smu, uint32_t *size, uint32_t pptable_id); +int smu_v13_0_update_pcie_parameters(struct smu_context *smu, + uint32_t pcie_gen_cap, + uint32_t pcie_width_cap); + #endif #endif diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 3856da6c3f3d..2ef877c2cb59 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -2424,3 +2424,34 @@ int smu_v13_0_mode1_reset(struct smu_context *smu) return ret; } + +int smu_v13_0_update_pcie_parameters(struct smu_context *smu, + uint32_t pcie_gen_cap, + uint32_t pcie_width_cap) +{ + struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; + struct smu_13_0_pcie_table *pcie_table = + &dpm_context->dpm_tables.pcie_table; + uint32_t smu_pcie_arg; + int ret, i; + + for (i = 0; i < pcie_table->num_of_link_levels; i++) { + if (pcie_table->pcie_gen[i] > pcie_gen_cap) + pcie_table->pcie_gen[i] = pcie_gen_cap; + if (pcie_table->pcie_lane[i] > pcie_width_cap) + pcie_table->pcie_lane[i] = pcie_width_cap; + + smu_pcie_arg = i << 16; + smu_pcie_arg |= pcie_table->pcie_gen[i] << 8; + smu_pcie_arg |= pcie_table->pcie_lane[i]; + + ret = smu_cmn_send_smc_msg_with_param(smu, + SMU_MSG_OverridePcieParameters, + smu_pcie_arg, + NULL); + if (ret) + return ret; + } + + return 0; +} diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c index 1d995f53aaab..b9bde5fa8f8f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_0_ppt.c @@ -1645,37 +1645,6 @@ static int smu_v13_0_0_force_clk_levels(struct smu_context *smu, return ret; } -static int smu_v13_0_0_update_pcie_parameters(struct smu_context *smu, - uint32_t pcie_gen_cap, - uint32_t pcie_width_cap) -{ - struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; - struct smu_13_0_pcie_table *pcie_table = - &dpm_context->dpm_tables.pcie_table; - uint32_t smu_pcie_arg; - int ret, i; - - for (i = 0; i < pcie_table->num_of_link_levels; i++) { - if (pcie_table->pcie_gen[i] > pcie_gen_cap) - pcie_table->pcie_gen[i] = pcie_gen_cap; - if (pcie_table->pcie_lane[i] > pcie_width_cap) - pcie_table->pcie_lane[i] = pcie_width_cap; - - smu_pcie_arg = i << 16; - smu_pcie_arg |= pcie_table->pcie_gen[i] << 8; - smu_pcie_arg |= pcie_table->pcie_lane[i]; - - ret = smu_cmn_send_smc_msg_with_param(smu, - SMU_MSG_OverridePcieParameters, - smu_pcie_arg, - NULL); - if (ret) - return ret; - } - - return 0; -} - static const struct smu_temperature_range smu13_thermal_policy[] = { {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000}, { 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000, 120000}, @@ -2654,7 +2623,7 @@ static const struct pptable_funcs smu_v13_0_0_ppt_funcs = { .feature_is_enabled = smu_cmn_feature_is_enabled, .print_clk_levels = smu_v13_0_0_print_clk_levels, .force_clk_levels = smu_v13_0_0_force_clk_levels, - .update_pcie_parameters = smu_v13_0_0_update_pcie_parameters, + .update_pcie_parameters = smu_v13_0_update_pcie_parameters, .get_thermal_temperature_range = smu_v13_0_0_get_thermal_temperature_range, .register_irq_handler = smu_v13_0_register_irq_handler, .enable_thermal_alert = smu_v13_0_enable_thermal_alert, diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c index cda4e818aab7..3ba02131e682 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0_7_ppt.c @@ -1635,37 +1635,6 @@ static int smu_v13_0_7_force_clk_levels(struct smu_context *smu, return ret; } -static int smu_v13_0_7_update_pcie_parameters(struct smu_context *smu, - uint32_t pcie_gen_cap, - uint32_t pcie_width_cap) -{ - struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; - struct smu_13_0_pcie_table *pcie_table = - &dpm_context->dpm_tables.pcie_table; - uint32_t smu_pcie_arg; - int ret, i; - - for (i = 0; i < pcie_table->num_of_link_levels; i++) { - if (pcie_table->pcie_gen[i] > pcie_gen_cap) - pcie_table->pcie_gen[i] = pcie_gen_cap; - if (pcie_table->pcie_lane[i] > pcie_width_cap) - pcie_table->pcie_lane[i] = pcie_width_cap; - - smu_pcie_arg = i << 16; - smu_pcie_arg |= pcie_table->pcie_gen[i] << 8; - smu_pcie_arg |= pcie_table->pcie_lane[i]; - - ret = smu_cmn_send_smc_msg_with_param(smu, - SMU_MSG_OverridePcieParameters, - smu_pcie_arg, - NULL); - if (ret) - return ret; - } - - return 0; -} - static const struct smu_temperature_range smu13_thermal_policy[] = { {-273150, 99000, 99000, -273150, 99000, 99000, -273150, 99000, 99000}, @@ -2234,7 +2203,7 @@ static const struct pptable_funcs smu_v13_0_7_ppt_funcs = { .feature_is_enabled = smu_cmn_feature_is_enabled, .print_clk_levels = smu_v13_0_7_print_clk_levels, .force_clk_levels = smu_v13_0_7_force_clk_levels, - .update_pcie_parameters = smu_v13_0_7_update_pcie_parameters, + .update_pcie_parameters = smu_v13_0_update_pcie_parameters, .get_thermal_temperature_range = smu_v13_0_7_get_thermal_temperature_range, .register_irq_handler = smu_v13_0_register_irq_handler, .enable_thermal_alert = smu_v13_0_enable_thermal_alert, -- cgit v1.2.3 From 31c7a3b378a136adc63296a2ff17645896fcf303 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 7 Jul 2023 14:31:35 -0500 Subject: drm/amd/pm: conditionally disable pcie lane/speed switching for SMU13 Intel platforms such as Sapphire Rapids and Raptor Lake don't support dynamic pcie lane or speed switching. This limitation seems to carry over from one generation to another. To be safer, disable dynamic pcie lane width and speed switching when running on an Intel platform. Link: https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2663 Co-developed-by: Evan Quan Signed-off-by: Evan Quan Signed-off-by: Mario Limonciello Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 42 ++++++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 2ef877c2cb59..cf7e729020ab 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -2425,6 +2425,25 @@ int smu_v13_0_mode1_reset(struct smu_context *smu) return ret; } +/* + * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic + * speed switching. Until we have confirmation from Intel that a specific host + * supports it, it's safer that we keep it disabled for all. + * + * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ + * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 + */ +static bool smu_v13_0_is_pcie_dynamic_switching_supported(void) +{ +#if IS_ENABLED(CONFIG_X86) + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor == X86_VENDOR_INTEL) + return false; +#endif + return true; +} + int smu_v13_0_update_pcie_parameters(struct smu_context *smu, uint32_t pcie_gen_cap, uint32_t pcie_width_cap) @@ -2432,15 +2451,32 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu, struct smu_13_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; struct smu_13_0_pcie_table *pcie_table = &dpm_context->dpm_tables.pcie_table; + int num_of_levels = pcie_table->num_of_link_levels; uint32_t smu_pcie_arg; int ret, i; - for (i = 0; i < pcie_table->num_of_link_levels; i++) { - if (pcie_table->pcie_gen[i] > pcie_gen_cap) + if (!smu_v13_0_is_pcie_dynamic_switching_supported()) { + if (pcie_table->pcie_gen[num_of_levels - 1] < pcie_gen_cap) + pcie_gen_cap = pcie_table->pcie_gen[num_of_levels - 1]; + + if (pcie_table->pcie_lane[num_of_levels - 1] < pcie_width_cap) + pcie_width_cap = pcie_table->pcie_lane[num_of_levels - 1]; + + /* Force all levels to use the same settings */ + for (i = 0; i < num_of_levels; i++) { pcie_table->pcie_gen[i] = pcie_gen_cap; - if (pcie_table->pcie_lane[i] > pcie_width_cap) pcie_table->pcie_lane[i] = pcie_width_cap; + } + } else { + for (i = 0; i < num_of_levels; i++) { + if (pcie_table->pcie_gen[i] > pcie_gen_cap) + pcie_table->pcie_gen[i] = pcie_gen_cap; + if (pcie_table->pcie_lane[i] > pcie_width_cap) + pcie_table->pcie_lane[i] = pcie_width_cap; + } + } + for (i = 0; i < num_of_levels; i++) { smu_pcie_arg = i << 16; smu_pcie_arg |= pcie_table->pcie_gen[i] << 8; smu_pcie_arg |= pcie_table->pcie_lane[i]; -- cgit v1.2.3 From 188623076d0f1a500583d392b6187056bf7cc71a Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 7 Jul 2023 21:26:08 -0500 Subject: drm/amd: Move helper for dynamic speed switch check out of smu13 This helper is used for checking if the connected host supports the feature, it can be moved into generic code to be used by other smu implementations as well. Signed-off-by: Mario Limonciello Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 19 +++++++++++++++++++ drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 21 +-------------------- 3 files changed, 21 insertions(+), 20 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2f9c14aca73c..a3b86b86dc47 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1296,6 +1296,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, void amdgpu_device_pci_config_reset(struct amdgpu_device *adev); int amdgpu_device_pci_reset(struct amdgpu_device *adev); bool amdgpu_device_need_post(struct amdgpu_device *adev); +bool amdgpu_device_pcie_dynamic_switching_supported(void); bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev); bool amdgpu_device_aspm_support_quirk(void); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index a92c6189b4b6..a2cdde0ca0a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -1458,6 +1458,25 @@ bool amdgpu_device_need_post(struct amdgpu_device *adev) return true; } +/* + * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic + * speed switching. Until we have confirmation from Intel that a specific host + * supports it, it's safer that we keep it disabled for all. + * + * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ + * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 + */ +bool amdgpu_device_pcie_dynamic_switching_supported(void) +{ +#if IS_ENABLED(CONFIG_X86) + struct cpuinfo_x86 *c = &cpu_data(0); + + if (c->x86_vendor == X86_VENDOR_INTEL) + return false; +#endif + return true; +} + /** * amdgpu_device_should_use_aspm - check if the device should program ASPM * diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index cf7e729020ab..9b62b45ebb7f 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -2425,25 +2425,6 @@ int smu_v13_0_mode1_reset(struct smu_context *smu) return ret; } -/* - * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic - * speed switching. Until we have confirmation from Intel that a specific host - * supports it, it's safer that we keep it disabled for all. - * - * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/ - * https://gitlab.freedesktop.org/drm/amd/-/issues/2663 - */ -static bool smu_v13_0_is_pcie_dynamic_switching_supported(void) -{ -#if IS_ENABLED(CONFIG_X86) - struct cpuinfo_x86 *c = &cpu_data(0); - - if (c->x86_vendor == X86_VENDOR_INTEL) - return false; -#endif - return true; -} - int smu_v13_0_update_pcie_parameters(struct smu_context *smu, uint32_t pcie_gen_cap, uint32_t pcie_width_cap) @@ -2455,7 +2436,7 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu, uint32_t smu_pcie_arg; int ret, i; - if (!smu_v13_0_is_pcie_dynamic_switching_supported()) { + if (!amdgpu_device_pcie_dynamic_switching_supported()) { if (pcie_table->pcie_gen[num_of_levels - 1] < pcie_gen_cap) pcie_gen_cap = pcie_table->pcie_gen[num_of_levels - 1]; -- cgit v1.2.3 From e701156ccc6c7a5f104a968dda74cd6434178712 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 7 Jul 2023 21:26:09 -0500 Subject: drm/amd: Align SMU11 SMU_MSG_OverridePcieParameters implementation with SMU13 SMU13 overrides dynamic PCIe lane width and dynamic speed by when on certain hosts. commit 38e4ced80479 ("drm/amd/pm: conditionally disable pcie lane switching for some sienna_cichlid SKUs") worked around this issue by setting up certain SKUs to set up certain limits, but the same fundamental problem with those hosts affects all SMU11 implmentations as well, so align the SMU11 and SMU13 driver handling. Signed-off-by: Mario Limonciello Reviewed-by: Evan Quan Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org # 6.1.x --- .../drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c | 89 +++++----------------- 1 file changed, 18 insertions(+), 71 deletions(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c index 8fe2e1716da4..f6599c00a6fd 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/sienna_cichlid_ppt.c @@ -2077,89 +2077,36 @@ static int sienna_cichlid_display_disable_memory_clock_switch(struct smu_context return ret; } -static void sienna_cichlid_get_override_pcie_settings(struct smu_context *smu, - uint32_t *gen_speed_override, - uint32_t *lane_width_override) -{ - struct amdgpu_device *adev = smu->adev; - - *gen_speed_override = 0xff; - *lane_width_override = 0xff; - - switch (adev->pdev->device) { - case 0x73A0: - case 0x73A1: - case 0x73A2: - case 0x73A3: - case 0x73AB: - case 0x73AE: - /* Bit 7:0: PCIE lane width, 1 to 7 corresponds is x1 to x32 */ - *lane_width_override = 6; - break; - case 0x73E0: - case 0x73E1: - case 0x73E3: - *lane_width_override = 4; - break; - case 0x7420: - case 0x7421: - case 0x7422: - case 0x7423: - case 0x7424: - *lane_width_override = 3; - break; - default: - break; - } -} - -#define MAX(a, b) ((a) > (b) ? (a) : (b)) - static int sienna_cichlid_update_pcie_parameters(struct smu_context *smu, uint32_t pcie_gen_cap, uint32_t pcie_width_cap) { struct smu_11_0_dpm_context *dpm_context = smu->smu_dpm.dpm_context; struct smu_11_0_pcie_table *pcie_table = &dpm_context->dpm_tables.pcie_table; - uint32_t gen_speed_override, lane_width_override; - uint8_t *table_member1, *table_member2; - uint32_t min_gen_speed, max_gen_speed; - uint32_t min_lane_width, max_lane_width; - uint32_t smu_pcie_arg; + u32 smu_pcie_arg; int ret, i; - GET_PPTABLE_MEMBER(PcieGenSpeed, &table_member1); - GET_PPTABLE_MEMBER(PcieLaneCount, &table_member2); - - sienna_cichlid_get_override_pcie_settings(smu, - &gen_speed_override, - &lane_width_override); + /* PCIE gen speed and lane width override */ + if (!amdgpu_device_pcie_dynamic_switching_supported()) { + if (pcie_table->pcie_gen[NUM_LINK_LEVELS - 1] < pcie_gen_cap) + pcie_gen_cap = pcie_table->pcie_gen[NUM_LINK_LEVELS - 1]; - /* PCIE gen speed override */ - if (gen_speed_override != 0xff) { - min_gen_speed = MIN(pcie_gen_cap, gen_speed_override); - max_gen_speed = MIN(pcie_gen_cap, gen_speed_override); - } else { - min_gen_speed = MAX(0, table_member1[0]); - max_gen_speed = MIN(pcie_gen_cap, table_member1[1]); - min_gen_speed = min_gen_speed > max_gen_speed ? - max_gen_speed : min_gen_speed; - } - pcie_table->pcie_gen[0] = min_gen_speed; - pcie_table->pcie_gen[1] = max_gen_speed; + if (pcie_table->pcie_lane[NUM_LINK_LEVELS - 1] < pcie_width_cap) + pcie_width_cap = pcie_table->pcie_lane[NUM_LINK_LEVELS - 1]; - /* PCIE lane width override */ - if (lane_width_override != 0xff) { - min_lane_width = MIN(pcie_width_cap, lane_width_override); - max_lane_width = MIN(pcie_width_cap, lane_width_override); + /* Force all levels to use the same settings */ + for (i = 0; i < NUM_LINK_LEVELS; i++) { + pcie_table->pcie_gen[i] = pcie_gen_cap; + pcie_table->pcie_lane[i] = pcie_width_cap; + } } else { - min_lane_width = MAX(1, table_member2[0]); - max_lane_width = MIN(pcie_width_cap, table_member2[1]); - min_lane_width = min_lane_width > max_lane_width ? - max_lane_width : min_lane_width; + for (i = 0; i < NUM_LINK_LEVELS; i++) { + if (pcie_table->pcie_gen[i] > pcie_gen_cap) + pcie_table->pcie_gen[i] = pcie_gen_cap; + if (pcie_table->pcie_lane[i] > pcie_width_cap) + pcie_table->pcie_lane[i] = pcie_width_cap; + } } - pcie_table->pcie_lane[0] = min_lane_width; - pcie_table->pcie_lane[1] = max_lane_width; for (i = 0; i < NUM_LINK_LEVELS; i++) { smu_pcie_arg = (i << 16 | -- cgit v1.2.3 From 60e445bdfccbb90c1bc13a92e128e50ba4357b3c Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Fri, 7 Jul 2023 15:21:56 -0600 Subject: nvme-fc: return non-zero status code when fails to create association Return non-zero status code(-EIO) when needed, so re-connecting or deleting controller will be triggered properly. Signed-off-by: Michael Liang Reviewed-by: Caleb Sander Reviewed-by: James Smart Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 691f2df574ce..ad9336343e01 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2551,6 +2551,9 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { __nvme_fc_abort_outstanding_ios(ctrl, true); set_bit(ASSOC_FAILED, &ctrl->flags); + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: transport error during (re)connect\n", + ctrl->cnum); return; } @@ -3110,7 +3113,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) */ ret = nvme_enable_ctrl(&ctrl->ctrl); - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; + if (ret) goto out_disconnect_admin_queue; ctrl->ctrl.max_segments = ctrl->lport->ops->max_sgl_segments; @@ -3120,7 +3125,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) nvme_unquiesce_admin_queue(&ctrl->ctrl); ret = nvme_init_ctrl_finish(&ctrl->ctrl, false); - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; + if (ret) goto out_disconnect_admin_queue; /* sanity checks */ @@ -3165,7 +3172,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } - if (ret || test_bit(ASSOC_FAILED, &ctrl->flags)) + if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) + ret = -EIO; + if (ret) goto out_term_aen_ops; changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); @@ -3180,6 +3189,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) out_term_aen_ops: nvme_fc_term_aen_ops(ctrl); out_disconnect_admin_queue: + dev_warn(ctrl->ctrl.device, + "NVME-FC{%d}: create_assoc failed, assoc_id %llx ret %d\n", + ctrl->cnum, ctrl->association_id, ret); /* send a Disconnect(association) LS to fc-nvme target */ nvme_fc_xmt_disconnect_assoc(ctrl); spin_lock_irqsave(&ctrl->lock, flags); -- cgit v1.2.3 From ee6fdc5055e916b1dd497f11260d4901c4c1e55e Mon Sep 17 00:00:00 2001 From: Michael Liang Date: Fri, 7 Jul 2023 15:21:57 -0600 Subject: nvme-fc: fix race between error recovery and creating association There is a small race window between nvme-fc association creation and error recovery. Fix this race condition by protecting accessing to controller state and ASSOC_FAILED flag under nvme-fc controller lock. Signed-off-by: Michael Liang Reviewed-by: Caleb Sander Reviewed-by: James Smart Signed-off-by: Keith Busch --- drivers/nvme/host/fc.c | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index ad9336343e01..1cd2bf82319a 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2548,17 +2548,24 @@ nvme_fc_error_recovery(struct nvme_fc_ctrl *ctrl, char *errmsg) * the controller. Abort any ios on the association and let the * create_association error path resolve things. */ - if (ctrl->ctrl.state == NVME_CTRL_CONNECTING) { - __nvme_fc_abort_outstanding_ios(ctrl, true); + enum nvme_ctrl_state state; + unsigned long flags; + + spin_lock_irqsave(&ctrl->lock, flags); + state = ctrl->ctrl.state; + if (state == NVME_CTRL_CONNECTING) { set_bit(ASSOC_FAILED, &ctrl->flags); + spin_unlock_irqrestore(&ctrl->lock, flags); + __nvme_fc_abort_outstanding_ios(ctrl, true); dev_warn(ctrl->ctrl.device, "NVME-FC{%d}: transport error during (re)connect\n", ctrl->cnum); return; } + spin_unlock_irqrestore(&ctrl->lock, flags); /* Otherwise, only proceed if in LIVE state - e.g. on first error */ - if (ctrl->ctrl.state != NVME_CTRL_LIVE) + if (state != NVME_CTRL_LIVE) return; dev_warn(ctrl->ctrl.device, @@ -3172,12 +3179,16 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl) else ret = nvme_fc_recreate_io_queues(ctrl); } + + spin_lock_irqsave(&ctrl->lock, flags); if (!ret && test_bit(ASSOC_FAILED, &ctrl->flags)) ret = -EIO; - if (ret) + if (ret) { + spin_unlock_irqrestore(&ctrl->lock, flags); goto out_term_aen_ops; - + } changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE); + spin_unlock_irqrestore(&ctrl->lock, flags); ctrl->ctrl.nr_reconnects = 0; -- cgit v1.2.3 From 71a5bb153be104d9175636e95166fd5e37466649 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 12 Jul 2023 08:36:47 -0700 Subject: nvme: ensure disabling pairs with unquiesce If any error handling that disables the controller fails to queue the reset work, like if the state changed to disconnected inbetween, then the failed teardown needs to unquiesce the queues since it's no longer paired with reset_work. Just make sure that the controller can be put into a resetting state prior to starting the disable so that no other handling can change the queue states while recovery is happening. Reported-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8754b4a5c684..8e7dbe0ab890 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1298,9 +1298,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) */ if (nvme_should_reset(dev, csts)) { nvme_warn_reset(dev, csts); - nvme_dev_disable(dev, false); - nvme_reset_ctrl(&dev->ctrl); - return BLK_EH_DONE; + goto disable; } /* @@ -1351,10 +1349,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); nvme_req(req)->flags |= NVME_REQ_CANCELLED; - nvme_dev_disable(dev, false); - nvme_reset_ctrl(&dev->ctrl); - - return BLK_EH_DONE; + goto disable; } if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { @@ -1391,6 +1386,15 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) * as the device then is in a faulty state. */ return BLK_EH_RESET_TIMER; + +disable: + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) + return BLK_EH_DONE; + + nvme_dev_disable(dev, false); + if (nvme_try_sched_reset(&dev->ctrl)) + nvme_unquiesce_io_queues(&dev->ctrl); + return BLK_EH_DONE; } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -3278,6 +3282,10 @@ static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, case pci_channel_io_frozen: dev_warn(dev->ctrl.device, "frozen state error detected, reset controller\n"); + if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) { + nvme_dev_disable(dev, true); + return PCI_ERS_RESULT_DISCONNECT; + } nvme_dev_disable(dev, false); return PCI_ERS_RESULT_NEED_RESET; case pci_channel_io_perm_failure: @@ -3294,7 +3302,8 @@ static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) dev_info(dev->ctrl.device, "restart after slot reset\n"); pci_restore_state(pdev); - nvme_reset_ctrl(&dev->ctrl); + if (!nvme_try_sched_reset(&dev->ctrl)) + nvme_unquiesce_io_queues(&dev->ctrl); return PCI_ERS_RESULT_RECOVERED; } -- cgit v1.2.3 From ab2dbc7accedc2e98eb7d8b8878d337e3b36c95d Mon Sep 17 00:00:00 2001 From: Palmer Dabbelt Date: Tue, 11 Jul 2023 15:46:00 -0700 Subject: RISC-V: Don't include Zicsr or Zifencei in I from ACPI ACPI ISA strings are based on a specification after Zicsr and Zifencei were split out of I, so we shouldn't be treating them as part of I. We haven't release an ACPI-based kernel yet, so we don't need to worry about compatibility with the old ISA strings. Fixes: 07edc32779e3 ("RISC-V: always report presence of extensions formerly part of the base ISA") Reviewed-by: Conor Dooley Reviewed-by: Sunil V L Link: https://lore.kernel.org/r/20230711224600.10879-1-palmer@rivosinc.com Cc: stable@vger.kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/cpufeature.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index bdcf460ea53d..a8f66c015229 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -317,19 +317,14 @@ void __init riscv_fill_hwcap(void) #undef SET_ISA_EXT_MAP } - /* - * Linux requires the following extensions, so we may as well - * always set them. - */ - set_bit(RISCV_ISA_EXT_ZICSR, isainfo->isa); - set_bit(RISCV_ISA_EXT_ZIFENCEI, isainfo->isa); - /* * These ones were as they were part of the base ISA when the * port & dt-bindings were upstreamed, and so can be set * unconditionally where `i` is in riscv,isa on DT systems. */ if (acpi_disabled) { + set_bit(RISCV_ISA_EXT_ZICSR, isainfo->isa); + set_bit(RISCV_ISA_EXT_ZIFENCEI, isainfo->isa); set_bit(RISCV_ISA_EXT_ZICNTR, isainfo->isa); set_bit(RISCV_ISA_EXT_ZIHPM, isainfo->isa); } -- cgit v1.2.3 From f673b4f5bd13365c8bee2f38c9794b635c73a302 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Wed, 12 Jul 2023 10:33:43 -0700 Subject: block/mq-deadline: Fix a bug in deadline_from_pos() A bug was introduced in deadline_from_pos() while implementing the suggestion to use round_down() in the following code: pos -= bdev_offset_from_zone_start(rq->q->disk->part0, pos); This patch makes deadline_from_pos() use round_down() such that 'pos' is rounded down. Reported-by: Shin'ichiro Kawasaki Closes: https://lore.kernel.org/all/5zthzi3lppvcdp4nemum6qck4gpqbdhvgy4k3qwguhgzxc4quj@amulvgycq67h/ Cc: Christoph Hellwig Cc: Damien Le Moal Fixes: 0effb390c4ba ("block: mq-deadline: Handle requeued requests correctly") Signed-off-by: Bart Van Assche Link: https://lore.kernel.org/r/20230712173344.2994513-1-bvanassche@acm.org Signed-off-by: Jens Axboe --- block/mq-deadline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 6aa5daf7ae32..02a916ba62ee 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -176,7 +176,7 @@ static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, * zoned writes, start searching from the start of a zone. */ if (blk_rq_is_seq_zoned_write(rq)) - pos -= round_down(pos, rq->q->limits.chunk_sectors); + pos = round_down(pos, rq->q->limits.chunk_sectors); while (node) { rq = rb_entry_rq(node); -- cgit v1.2.3 From 5b10c18d1bf9238d842db6c8e86cc0066185c391 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 11 Jul 2023 23:52:50 -0700 Subject: perf parse-events: Avoid SEGV if PMU lookup fails for legacy cache terms libfuzzer found the following command could SEGV: $ perf stat -e cpu/L2,L2/ true This is because the L2 term rewrites the perf_event_attr type to PERF_TYPE_HW_CACHE which then fails the PMU lookup for the second legacy cache term. The new failure is consistent with repeated hardware terms: $ perf stat -e cpu/L2,L2/ true event syntax error: 'cpu/L2,L2/' \___ Failed to find PMU for type 3 Initial error: event syntax error: 'cpu/L2,L2/' \___ Failed to find PMU for type 3 Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events $ perf stat -e cpu/cycles,cycles/ true event syntax error: 'cpu/cycles,cycles/' \___ Failed to find PMU for type 0 Initial error: event syntax error: 'cpu/cycles,cycles/' \___ Failed to find PMU for type 0 Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events Committer testing: Before: $ perf stat -e cpu/L2,L2/ true Segmentation fault (core dumped) $ After: $ perf stat -e cpu/L2,L2/ true event syntax error: 'cpu/L2,L2/' \___ Failed to find PMU for type 3 Initial error: event syntax error: 'cpu/L2,L2/' \___ Failed to find PMU for type 3 Run 'perf list' for a list of valid events Usage: perf stat [] [] -e, --event event selector. use 'perf list' to list available events $ Fixes: 6fd1e5191591f9d5 ("perf parse-events: Support PMUs for legacy cache events") Signed-off-by: Ian Rogers Tested-by: Arnaldo Carvalho de Melo Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Xing Zhengjun Link: https://lore.kernel.org/r/20230712065250.1450306-1-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/parse-events.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 5dcfbf316bf6..acde097e327c 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -1216,6 +1216,14 @@ static int config_term_pmu(struct perf_event_attr *attr, if (term->type_term == PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE) { const struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); + if (!pmu) { + char *err_str; + + if (asprintf(&err_str, "Failed to find PMU for type %d", attr->type) >= 0) + parse_events_error__handle(err, term->err_term, + err_str, /*help=*/NULL); + return -EINVAL; + } if (perf_pmu__supports_legacy_cache(pmu)) { attr->type = PERF_TYPE_HW_CACHE; return parse_events__decode_legacy_cache(term->config, pmu->type, -- cgit v1.2.3 From 938a06c8b7913455073506c33ae3bff029c3c4ef Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Mon, 22 May 2023 22:18:38 +0200 Subject: drm/nouveau/acr: Abort loading ACR if no firmware was found This fixes a NULL pointer access inside nvkm_acr_oneinit in case necessary firmware files couldn't be loaded. Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/212 Fixes: 4b569ded09fd ("drm/nouveau/acr/ga102: initial support") Signed-off-by: Karol Herbst Reviewed-by: Dave Airlie Link: https://patchwork.freedesktop.org/patch/msgid/20230522201838.1496622-1-kherbst@redhat.com --- drivers/gpu/drm/nouveau/nvkm/subdev/acr/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/acr/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/acr/base.c index 795f3a649b12..9b8ca4e898f9 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/acr/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/acr/base.c @@ -224,7 +224,7 @@ nvkm_acr_oneinit(struct nvkm_subdev *subdev) u64 falcons; int ret, i; - if (list_empty(&acr->hsfw)) { + if (list_empty(&acr->hsfw) || !acr->func || !acr->func->wpr_layout) { nvkm_debug(subdev, "No HSFW(s)\n"); nvkm_acr_cleanup(acr); return 0; -- cgit v1.2.3 From 835a65f51790e1f72b1ab106ec89db9ac15b47d6 Mon Sep 17 00:00:00 2001 From: Karol Herbst Date: Fri, 26 May 2023 11:10:52 +0200 Subject: drm/nouveau: bring back blit subchannel for pre nv50 GPUs 1ba6113a90a0 removed a lot of the kernel GPU channel, but method 0x128 was important as otherwise the GPU spams us with `CACHE_ERROR` messages. We use the blit subchannel inside our vblank handling, so we should keep at least this part. v2: Only do it for NV11+ GPUs Closes: https://gitlab.freedesktop.org/drm/nouveau/-/issues/201 Fixes: 4a16dd9d18a0 ("drm/nouveau/kms: switch to drm fbdev helpers") Signed-off-by: Karol Herbst Reviewed-by: Ben Skeggs Link: https://patchwork.freedesktop.org/patch/msgid/20230526091052.2169044-1-kherbst@redhat.com --- drivers/gpu/drm/nouveau/nouveau_chan.c | 1 + drivers/gpu/drm/nouveau/nouveau_chan.h | 1 + drivers/gpu/drm/nouveau/nouveau_drm.c | 20 +++++++++++++++++--- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.c b/drivers/gpu/drm/nouveau/nouveau_chan.c index e648ecd0c1a0..3dfbc374478e 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.c +++ b/drivers/gpu/drm/nouveau/nouveau_chan.c @@ -90,6 +90,7 @@ nouveau_channel_del(struct nouveau_channel **pchan) if (cli) nouveau_svmm_part(chan->vmm->svmm, chan->inst); + nvif_object_dtor(&chan->blit); nvif_object_dtor(&chan->nvsw); nvif_object_dtor(&chan->gart); nvif_object_dtor(&chan->vram); diff --git a/drivers/gpu/drm/nouveau/nouveau_chan.h b/drivers/gpu/drm/nouveau/nouveau_chan.h index e06a8ffed31a..bad7466bd0d5 100644 --- a/drivers/gpu/drm/nouveau/nouveau_chan.h +++ b/drivers/gpu/drm/nouveau/nouveau_chan.h @@ -53,6 +53,7 @@ struct nouveau_channel { u32 user_put; struct nvif_object user; + struct nvif_object blit; struct nvif_event kill; atomic_t killed; diff --git a/drivers/gpu/drm/nouveau/nouveau_drm.c b/drivers/gpu/drm/nouveau/nouveau_drm.c index 7aac9384600e..40fb9a834918 100644 --- a/drivers/gpu/drm/nouveau/nouveau_drm.c +++ b/drivers/gpu/drm/nouveau/nouveau_drm.c @@ -375,15 +375,29 @@ nouveau_accel_gr_init(struct nouveau_drm *drm) ret = nvif_object_ctor(&drm->channel->user, "drmNvsw", NVDRM_NVSW, nouveau_abi16_swclass(drm), NULL, 0, &drm->channel->nvsw); + + if (ret == 0 && device->info.chipset >= 0x11) { + ret = nvif_object_ctor(&drm->channel->user, "drmBlit", + 0x005f, 0x009f, + NULL, 0, &drm->channel->blit); + } + if (ret == 0) { struct nvif_push *push = drm->channel->chan.push; - ret = PUSH_WAIT(push, 2); - if (ret == 0) + ret = PUSH_WAIT(push, 8); + if (ret == 0) { + if (device->info.chipset >= 0x11) { + PUSH_NVSQ(push, NV05F, 0x0000, drm->channel->blit.handle); + PUSH_NVSQ(push, NV09F, 0x0120, 0, + 0x0124, 1, + 0x0128, 2); + } PUSH_NVSQ(push, NV_SW, 0x0000, drm->channel->nvsw.handle); + } } if (ret) { - NV_ERROR(drm, "failed to allocate sw class, %d\n", ret); + NV_ERROR(drm, "failed to allocate sw or blit class, %d\n", ret); nouveau_accel_gr_fini(drm); return; } -- cgit v1.2.3 From 0d5ace1a07f7e846d0f6d972af60d05515599d0b Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 5 Jul 2023 08:30:02 -0500 Subject: pinctrl: amd: Only use special debounce behavior for GPIO 0 It's uncommon to use debounce on any other pin, but technically we should only set debounce to 0 when working off GPIO0. Cc: stable@vger.kernel.org Tested-by: Jan Visser Fixes: 968ab9261627 ("pinctrl: amd: Detect internal GPIO0 debounce handling") Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20230705133005.577-2-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 3c4220be30ec..00d5b517b745 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -128,9 +128,11 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, raw_spin_lock_irqsave(&gpio_dev->lock, flags); /* Use special handling for Pin0 debounce */ - pin_reg = readl(gpio_dev->base + WAKE_INT_MASTER_REG); - if (pin_reg & INTERNAL_GPIO0_DEBOUNCE) - debounce = 0; + if (offset == 0) { + pin_reg = readl(gpio_dev->base + WAKE_INT_MASTER_REG); + if (pin_reg & INTERNAL_GPIO0_DEBOUNCE) + debounce = 0; + } pin_reg = readl(gpio_dev->base + offset * 4); -- cgit v1.2.3 From 635a750d958e158e17af0f524bedc484b27fbb93 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 5 Jul 2023 08:30:03 -0500 Subject: pinctrl: amd: Use amd_pinconf_set() for all config options On ASUS TUF A16 it is reported that the ITE5570 ACPI device connected to GPIO 7 is causing an interrupt storm. This issue doesn't happen on Windows. Comparing the GPIO register configuration between Windows and Linux bit 20 has been configured as a pull up on Windows, but not on Linux. Checking GPIO declaration from the firmware it is clear it *should* have been a pull up on Linux as well. ``` GpioInt (Level, ActiveLow, Exclusive, PullUp, 0x0000, "\\_SB.GPIO", 0x00, ResourceConsumer, ,) { // Pin list 0x0007 } ``` On Linux amd_gpio_set_config() is currently only used for programming the debounce. Actually the GPIO core calls it with all the arguments that are supported by a GPIO, pinctrl-amd just responds `-ENOTSUPP`. To solve this issue expand amd_gpio_set_config() to support the other arguments amd_pinconf_set() supports, namely `PIN_CONFIG_BIAS_PULL_DOWN`, `PIN_CONFIG_BIAS_PULL_UP`, and `PIN_CONFIG_DRIVE_STRENGTH`. Reported-by: Nik P Reported-by: Nathan Schulte Reported-by: Friedrich Vock Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217336 Reported-by: dridri85@gmail.com Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217493 Link: https://lore.kernel.org/linux-input/20230530154058.17594-1-friedrich.vock@gmx.de/ Tested-by: Jan Visser Fixes: 2956b5d94a76 ("pinctrl / gpio: Introduce .set_config() callback for GPIO chips") Signed-off-by: Mario Limonciello Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230705133005.577-3-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index 00d5b517b745..c71efd19f139 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -189,18 +189,6 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, return ret; } -static int amd_gpio_set_config(struct gpio_chip *gc, unsigned offset, - unsigned long config) -{ - u32 debounce; - - if (pinconf_to_config_param(config) != PIN_CONFIG_INPUT_DEBOUNCE) - return -ENOTSUPP; - - debounce = pinconf_to_config_argument(config); - return amd_gpio_set_debounce(gc, offset, debounce); -} - #ifdef CONFIG_DEBUG_FS static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) { @@ -782,7 +770,7 @@ static int amd_pinconf_get(struct pinctrl_dev *pctldev, } static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, - unsigned long *configs, unsigned num_configs) + unsigned long *configs, unsigned int num_configs) { int i; u32 arg; @@ -872,6 +860,20 @@ static int amd_pinconf_group_set(struct pinctrl_dev *pctldev, return 0; } +static int amd_gpio_set_config(struct gpio_chip *gc, unsigned int pin, + unsigned long config) +{ + struct amd_gpio *gpio_dev = gpiochip_get_data(gc); + + if (pinconf_to_config_param(config) == PIN_CONFIG_INPUT_DEBOUNCE) { + u32 debounce = pinconf_to_config_argument(config); + + return amd_gpio_set_debounce(gc, pin, debounce); + } + + return amd_pinconf_set(gpio_dev->pctrl, pin, &config, 1); +} + static const struct pinconf_ops amd_pinconf_ops = { .pin_config_get = amd_pinconf_get, .pin_config_set = amd_pinconf_set, -- cgit v1.2.3 From 3f62312d04d4c68aace9cd06fc135e09573325f3 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 5 Jul 2023 08:30:04 -0500 Subject: pinctrl: amd: Drop pull up select configuration MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit pinctrl-amd currently tries to program bit 19 of all GPIOs to select either a 4kΩ or 8hΩ pull up, but this isn't what bit 19 does. Bit 19 is marked as reserved, even in the latest platforms documentation. Drop this programming functionality. Tested-by: Jan Visser Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20230705133005.577-4-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 16 ++++------------ drivers/pinctrl/pinctrl-amd.h | 1 - 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index c71efd19f139..b7eb78fab594 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -210,7 +210,6 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) char *pin_sts; char *interrupt_sts; char *wake_sts; - char *pull_up_sel; char *orientation; char debounce_value[40]; char *debounce_enable; @@ -318,14 +317,9 @@ static void amd_gpio_dbg_show(struct seq_file *s, struct gpio_chip *gc) seq_printf(s, " %s|", wake_sts); if (pin_reg & BIT(PULL_UP_ENABLE_OFF)) { - if (pin_reg & BIT(PULL_UP_SEL_OFF)) - pull_up_sel = "8k"; - else - pull_up_sel = "4k"; - seq_printf(s, "%s ↑|", - pull_up_sel); + seq_puts(s, " ↑ |"); } else if (pin_reg & BIT(PULL_DOWN_ENABLE_OFF)) { - seq_puts(s, " ↓|"); + seq_puts(s, " ↓ |"); } else { seq_puts(s, " |"); } @@ -751,7 +745,7 @@ static int amd_pinconf_get(struct pinctrl_dev *pctldev, break; case PIN_CONFIG_BIAS_PULL_UP: - arg = (pin_reg >> PULL_UP_SEL_OFF) & (BIT(0) | BIT(1)); + arg = (pin_reg >> PULL_UP_ENABLE_OFF) & BIT(0); break; case PIN_CONFIG_DRIVE_STRENGTH: @@ -798,10 +792,8 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, break; case PIN_CONFIG_BIAS_PULL_UP: - pin_reg &= ~BIT(PULL_UP_SEL_OFF); - pin_reg |= (arg & BIT(0)) << PULL_UP_SEL_OFF; pin_reg &= ~BIT(PULL_UP_ENABLE_OFF); - pin_reg |= ((arg>>1) & BIT(0)) << PULL_UP_ENABLE_OFF; + pin_reg |= (arg & BIT(0)) << PULL_UP_ENABLE_OFF; break; case PIN_CONFIG_DRIVE_STRENGTH: diff --git a/drivers/pinctrl/pinctrl-amd.h b/drivers/pinctrl/pinctrl-amd.h index 1cf2d06bbd8c..34c5c3e71fb2 100644 --- a/drivers/pinctrl/pinctrl-amd.h +++ b/drivers/pinctrl/pinctrl-amd.h @@ -36,7 +36,6 @@ #define WAKE_CNTRL_OFF_S4 15 #define PIN_STS_OFF 16 #define DRV_STRENGTH_SEL_OFF 17 -#define PULL_UP_SEL_OFF 19 #define PULL_UP_ENABLE_OFF 20 #define PULL_DOWN_ENABLE_OFF 21 #define OUTPUT_VALUE_OFF 22 -- cgit v1.2.3 From 283c5ce7da0a676f46539094d40067ad17c4f294 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 5 Jul 2023 08:30:05 -0500 Subject: pinctrl: amd: Unify debounce handling into amd_pinconf_set() Debounce handling is done in two different entry points in the driver. Unify this to make sure that it's always handled the same. Tested-by: Jan Visser Signed-off-by: Mario Limonciello Link: https://lore.kernel.org/r/20230705133005.577-5-mario.limonciello@amd.com Signed-off-by: Linus Walleij --- drivers/pinctrl/pinctrl-amd.c | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/drivers/pinctrl/pinctrl-amd.c b/drivers/pinctrl/pinctrl-amd.c index b7eb78fab594..4a8c1b57a90d 100644 --- a/drivers/pinctrl/pinctrl-amd.c +++ b/drivers/pinctrl/pinctrl-amd.c @@ -116,16 +116,12 @@ static void amd_gpio_set_value(struct gpio_chip *gc, unsigned offset, int value) raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); } -static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, - unsigned debounce) +static int amd_gpio_set_debounce(struct amd_gpio *gpio_dev, unsigned int offset, + unsigned int debounce) { u32 time; u32 pin_reg; int ret = 0; - unsigned long flags; - struct amd_gpio *gpio_dev = gpiochip_get_data(gc); - - raw_spin_lock_irqsave(&gpio_dev->lock, flags); /* Use special handling for Pin0 debounce */ if (offset == 0) { @@ -184,7 +180,6 @@ static int amd_gpio_set_debounce(struct gpio_chip *gc, unsigned offset, pin_reg &= ~(DB_CNTRl_MASK << DB_CNTRL_OFF); } writel(pin_reg, gpio_dev->base + offset * 4); - raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); return ret; } @@ -782,9 +777,8 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, switch (param) { case PIN_CONFIG_INPUT_DEBOUNCE: - pin_reg &= ~DB_TMR_OUT_MASK; - pin_reg |= arg & DB_TMR_OUT_MASK; - break; + ret = amd_gpio_set_debounce(gpio_dev, pin, arg); + goto out_unlock; case PIN_CONFIG_BIAS_PULL_DOWN: pin_reg &= ~BIT(PULL_DOWN_ENABLE_OFF); @@ -811,6 +805,7 @@ static int amd_pinconf_set(struct pinctrl_dev *pctldev, unsigned int pin, writel(pin_reg, gpio_dev->base + pin*4); } +out_unlock: raw_spin_unlock_irqrestore(&gpio_dev->lock, flags); return ret; @@ -857,12 +852,6 @@ static int amd_gpio_set_config(struct gpio_chip *gc, unsigned int pin, { struct amd_gpio *gpio_dev = gpiochip_get_data(gc); - if (pinconf_to_config_param(config) == PIN_CONFIG_INPUT_DEBOUNCE) { - u32 debounce = pinconf_to_config_argument(config); - - return amd_gpio_set_debounce(gc, pin, debounce); - } - return amd_pinconf_set(gpio_dev->pctrl, pin, &config, 1); } -- cgit v1.2.3 From 8cc32a9bbf2934d90762d9de0187adcb5ad46a11 Mon Sep 17 00:00:00 2001 From: Yonghong Song Date: Wed, 28 Jun 2023 11:19:26 -0700 Subject: kallsyms: strip LTO-only suffixes from promoted global functions Commit 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static functions") stripped all function/variable suffixes started with '.' regardless of whether those suffixes are generated at LTO mode or not. In fact, as far as I know, in LTO mode, when a static function/variable is promoted to the global scope, '.llvm.<...>' suffix is added. The existing mechanism breaks live patch for a LTO kernel even if no .llvm.<...> symbols are involved. For example, for the following kernel symbols: $ grep bpf_verifier_vlog /proc/kallsyms ffffffff81549f60 t bpf_verifier_vlog ffffffff8268b430 d bpf_verifier_vlog._entry ffffffff8282a958 d bpf_verifier_vlog._entry_ptr ffffffff82e12a1f d bpf_verifier_vlog.__already_done 'bpf_verifier_vlog' is a static function. '_entry', '_entry_ptr' and '__already_done' are static variables used inside 'bpf_verifier_vlog', so llvm promotes them to file-level static with prefix 'bpf_verifier_vlog.'. Note that the func-level to file-level static function promotion also happens without LTO. Given a symbol name 'bpf_verifier_vlog', with LTO kernel, current mechanism will return 4 symbols to live patch subsystem which current live patching subsystem cannot handle it. With non-LTO kernel, only one symbol is returned. In [1], we have a lengthy discussion, the suggestion is to separate two cases: (1). new symbols with suffix which are generated regardless of whether LTO is enabled or not, and (2). new symbols with suffix generated only when LTO is enabled. The cleanup_symbol_name() should only remove suffixes for case (2). Case (1) should not be changed so it can work uniformly with or without LTO. This patch removed LTO-only suffix '.llvm.<...>' so live patching and tracing should work the same way for non-LTO kernel. The cleanup_symbol_name() in scripts/kallsyms.c is also changed to have the same filtering pattern so both kernel and kallsyms tool have the same expectation on the order of symbols. [1] https://lore.kernel.org/live-patching/20230615170048.2382735-1-song@kernel.org/T/#u Fixes: 6eb4bd92c1ce ("kallsyms: strip LTO suffixes from static functions") Reported-by: Song Liu Signed-off-by: Yonghong Song Reviewed-by: Zhen Lei Reviewed-by: Nick Desaulniers Acked-by: Song Liu Link: https://lore.kernel.org/r/20230628181926.4102448-1-yhs@fb.com Signed-off-by: Kees Cook --- kernel/kallsyms.c | 5 ++--- scripts/kallsyms.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 7982cc9d497c..016d997131d4 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -174,11 +174,10 @@ static bool cleanup_symbol_name(char *s) * LLVM appends various suffixes for local functions and variables that * must be promoted to global scope as part of LTO. This can break * hooking of static functions with kprobes. '.' is not a valid - * character in an identifier in C. Suffixes observed: + * character in an identifier in C. Suffixes only in LLVM LTO observed: * - foo.llvm.[0-9a-f]+ - * - foo.[0-9a-f]+ */ - res = strchr(s, '.'); + res = strstr(s, ".llvm."); if (res) { *res = '\0'; return true; diff --git a/scripts/kallsyms.c b/scripts/kallsyms.c index d387c9381650..16c87938b316 100644 --- a/scripts/kallsyms.c +++ b/scripts/kallsyms.c @@ -349,10 +349,10 @@ static void cleanup_symbol_name(char *s) * ASCII[_] = 5f * ASCII[a-z] = 61,7a * - * As above, replacing '.' with '\0' does not affect the main sorting, - * but it helps us with subsorting. + * As above, replacing the first '.' in ".llvm." with '\0' does not + * affect the main sorting, but it helps us with subsorting. */ - p = strchr(s, '.'); + p = strstr(s, ".llvm."); if (p) *p = '\0'; } -- cgit v1.2.3 From f1f047bd7ce0d73788e04ac02268060a565f7ecb Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Tue, 11 Jul 2023 17:12:31 -0600 Subject: smb: client: Fix -Wstringop-overflow issues pSMB->hdr.Protocol is an array of size 4 bytes, hence when the compiler analyzes this line of code parm_data = ((char *) &pSMB->hdr.Protocol) + offset; it legitimately complains about the fact that offset points outside the bounds of the array. Notice that the compiler gives priority to the object as an array, rather than merely the address of one more byte in a structure to wich offset should be added (which seems to be the actual intention of the original implementation). Fix this by explicitly instructing the compiler to treat the code as a sequence of bytes in struct smb_com_transaction2_spi_req, and not as an array accessed through pointer notation. Notice that ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) points to the same address as ((char *) &pSMB->hdr.Protocol), therefore this results in no differences in binary output. Fixes the following -Wstringop-overflow warnings when built s390 architecture with defconfig (GCC 13): CC [M] fs/smb/client/cifssmb.o In function 'cifs_init_ace', inlined from 'posix_acl_to_cifs' at fs/smb/client/cifssmb.c:3046:3, inlined from 'cifs_do_set_acl' at fs/smb/client/cifssmb.c:3191:15: fs/smb/client/cifssmb.c:2987:31: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 2987 | cifs_ace->cifs_e_perm = local_ace->e_perm; | ~~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~ In file included from fs/smb/client/cifssmb.c:27: fs/smb/client/cifspdu.h: In function 'cifs_do_set_acl': fs/smb/client/cifspdu.h:384:14: note: at offset [7, 11] into destination object 'Protocol' of size 4 384 | __u8 Protocol[4]; | ^~~~~~~~ In function 'cifs_init_ace', inlined from 'posix_acl_to_cifs' at fs/smb/client/cifssmb.c:3046:3, inlined from 'cifs_do_set_acl' at fs/smb/client/cifssmb.c:3191:15: fs/smb/client/cifssmb.c:2988:30: warning: writing 1 byte into a region of size 0 [-Wstringop-overflow=] 2988 | cifs_ace->cifs_e_tag = local_ace->e_tag; | ~~~~~~~~~~~~~~~~~~~~~^~~~~~~~~~~~~~~~~~~ fs/smb/client/cifspdu.h: In function 'cifs_do_set_acl': fs/smb/client/cifspdu.h:384:14: note: at offset [6, 10] into destination object 'Protocol' of size 4 384 | __u8 Protocol[4]; | ^~~~~~~~ This helps with the ongoing efforts to globally enable -Wstringop-overflow. Link: https://github.com/KSPP/linux/issues/310 Fixes: dc1af4c4b472 ("cifs: implement set acl method") Cc: stable@vger.kernel.org Signed-off-by: Gustavo A. R. Silva Reviewed-by: Kees Cook Signed-off-by: Steve French --- fs/smb/client/cifssmb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 19f7385abeec..9dee267f1893 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -3184,7 +3184,7 @@ setAclRetry: param_offset = offsetof(struct smb_com_transaction2_spi_req, InformationLevel) - 4; offset = param_offset + params; - parm_data = ((char *) &pSMB->hdr.Protocol) + offset; + parm_data = ((char *)pSMB) + sizeof(pSMB->hdr.smb_buf_length) + offset; pSMB->ParameterOffset = cpu_to_le16(param_offset); /* convert to on the wire format for POSIX ACL */ -- cgit v1.2.3 From bf99f6be2d20146942bce6f9e90a0ceef12cbc1e Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Tue, 11 Jul 2023 14:15:10 -0300 Subject: smb: client: fix missed ses refcounting Use new cifs_smb_ses_inc_refcount() helper to get an active reference of @ses and @ses->dfs_root_ses (if set). This will prevent @ses->dfs_root_ses of being put in the next call to cifs_put_smb_ses() and thus potentially causing an use-after-free bug. Fixes: 8e3554150d6c ("cifs: fix sharing of DFS connections") Signed-off-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/dfs.c | 26 ++++++++++---------------- fs/smb/client/smb2transport.c | 2 +- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/smb/client/dfs.c b/fs/smb/client/dfs.c index 1403a2d1ab17..df3fd3b720da 100644 --- a/fs/smb/client/dfs.c +++ b/fs/smb/client/dfs.c @@ -66,6 +66,12 @@ static int get_session(struct cifs_mount_ctx *mnt_ctx, const char *full_path) return rc; } +/* + * Track individual DFS referral servers used by new DFS mount. + * + * On success, their lifetime will be shared by final tcon (dfs_ses_list). + * Otherwise, they will be put by dfs_put_root_smb_sessions() in cifs_mount(). + */ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; @@ -80,11 +86,12 @@ static int add_root_smb_session(struct cifs_mount_ctx *mnt_ctx) INIT_LIST_HEAD(&root_ses->list); spin_lock(&cifs_tcp_ses_lock); - ses->ses_count++; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&cifs_tcp_ses_lock); root_ses->ses = ses; list_add_tail(&root_ses->list, &mnt_ctx->dfs_ses_list); } + /* Select new DFS referral server so that new referrals go through it */ ctx->dfs_root_ses = ses; return 0; } @@ -242,7 +249,6 @@ out: int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) { struct smb3_fs_context *ctx = mnt_ctx->fs_ctx; - struct cifs_ses *ses; bool nodfs = ctx->nodfs; int rc; @@ -276,20 +282,8 @@ int dfs_mount_share(struct cifs_mount_ctx *mnt_ctx, bool *isdfs) } *isdfs = true; - /* - * Prevent DFS root session of being put in the first call to - * cifs_mount_put_conns(). If another DFS root server was not found - * while chasing the referrals (@ctx->dfs_root_ses == @ses), then we - * can safely put extra refcount of @ses. - */ - ses = mnt_ctx->ses; - mnt_ctx->ses = NULL; - mnt_ctx->server = NULL; - rc = __dfs_mount_share(mnt_ctx); - if (ses == ctx->dfs_root_ses) - cifs_put_smb_ses(ses); - - return rc; + add_root_smb_session(mnt_ctx); + return __dfs_mount_share(mnt_ctx); } /* Update dfs referral path of superblock */ diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index c6db898dab7c..7676091b3e77 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -160,7 +160,7 @@ smb2_find_smb_ses_unlocked(struct TCP_Server_Info *server, __u64 ses_id) spin_unlock(&ses->ses_lock); continue; } - ++ses->ses_count; + cifs_smb_ses_inc_refcount(ses); spin_unlock(&ses->ses_lock); return ses; } -- cgit v1.2.3 From a282a2f10539dce2aa619e71e1817570d557fc97 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 10 Jul 2023 20:39:29 +0200 Subject: libceph: harden msgr2.1 frame segment length checks ceph_frame_desc::fd_lens is an int array. decode_preamble() thus effectively casts u32 -> int but the checks for segment lengths are written as if on unsigned values. While reading in HELLO or one of the AUTH frames (before authentication is completed), arithmetic in head_onwire_len() can get duped by negative ctrl_len and produce head_len which is less than CEPH_PREAMBLE_LEN but still positive. This would lead to a buffer overrun in prepare_read_control() as the preamble gets copied to the newly allocated buffer of size head_len. Cc: stable@vger.kernel.org Fixes: cd1a677cad99 ("libceph, ceph: implement msgr2.1 protocol (crc and secure modes)") Reported-by: Thelford Williams Signed-off-by: Ilya Dryomov Reviewed-by: Xiubo Li --- net/ceph/messenger_v2.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index 1a888b86a494..1df1d29dee92 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -390,6 +390,8 @@ static int head_onwire_len(int ctrl_len, bool secure) int head_len; int rem_len; + BUG_ON(ctrl_len < 0 || ctrl_len > CEPH_MSG_MAX_CONTROL_LEN); + if (secure) { head_len = CEPH_PREAMBLE_SECURE_LEN; if (ctrl_len > CEPH_PREAMBLE_INLINE_LEN) { @@ -408,6 +410,10 @@ static int head_onwire_len(int ctrl_len, bool secure) static int __tail_onwire_len(int front_len, int middle_len, int data_len, bool secure) { + BUG_ON(front_len < 0 || front_len > CEPH_MSG_MAX_FRONT_LEN || + middle_len < 0 || middle_len > CEPH_MSG_MAX_MIDDLE_LEN || + data_len < 0 || data_len > CEPH_MSG_MAX_DATA_LEN); + if (!front_len && !middle_len && !data_len) return 0; @@ -520,29 +526,34 @@ static int decode_preamble(void *p, struct ceph_frame_desc *desc) desc->fd_aligns[i] = ceph_decode_16(&p); } - /* - * This would fire for FRAME_TAG_WAIT (it has one empty - * segment), but we should never get it as client. - */ - if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { - pr_err("last segment empty\n"); + if (desc->fd_lens[0] < 0 || + desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { + pr_err("bad control segment length %d\n", desc->fd_lens[0]); return -EINVAL; } - - if (desc->fd_lens[0] > CEPH_MSG_MAX_CONTROL_LEN) { - pr_err("control segment too big %d\n", desc->fd_lens[0]); + if (desc->fd_lens[1] < 0 || + desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { + pr_err("bad front segment length %d\n", desc->fd_lens[1]); return -EINVAL; } - if (desc->fd_lens[1] > CEPH_MSG_MAX_FRONT_LEN) { - pr_err("front segment too big %d\n", desc->fd_lens[1]); + if (desc->fd_lens[2] < 0 || + desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { + pr_err("bad middle segment length %d\n", desc->fd_lens[2]); return -EINVAL; } - if (desc->fd_lens[2] > CEPH_MSG_MAX_MIDDLE_LEN) { - pr_err("middle segment too big %d\n", desc->fd_lens[2]); + if (desc->fd_lens[3] < 0 || + desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { + pr_err("bad data segment length %d\n", desc->fd_lens[3]); return -EINVAL; } - if (desc->fd_lens[3] > CEPH_MSG_MAX_DATA_LEN) { - pr_err("data segment too big %d\n", desc->fd_lens[3]); + + /* + * This would fire for FRAME_TAG_WAIT (it has one empty + * segment), but we should never get it as client. + */ + if (!desc->fd_lens[desc->fd_seg_cnt - 1]) { + pr_err("last segment empty, segment count %d\n", + desc->fd_seg_cnt); return -EINVAL; } -- cgit v1.2.3 From ac522fc6c3165fd0daa2f8da7e07d5f800586daa Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 13 Jul 2023 15:30:42 +0200 Subject: nvme: don't reject probe due to duplicate IDs for single-ported PCIe devices While duplicate IDs are still very harmful, including the potential to easily see changing devices in /dev/disk/by-id, it turn out they are extremely common for cheap end user NVMe devices. Relax our check for them for so that it doesn't reject the probe on single-ported PCIe devices, but prints a big warning instead. In doubt we'd still like to see quirk entries to disable the potential for changing supposed stable device identifier links, but this will at least allow users how have two (or more) of these devices to use them without having to manually add a new PCI ID entry with the quirk through sysfs or by patching the kernel. Fixes: 2079f41ec6ff ("nvme: check that EUI/GUID/UUID are globally unique") Cc: stable@vger.kernel.org # 6.0+ Co-developed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/core.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 47d7ba2827ff..37b6fa746662 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3431,10 +3431,40 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info) ret = nvme_global_check_duplicate_ids(ctrl->subsys, &info->ids); if (ret) { - dev_err(ctrl->device, - "globally duplicate IDs for nsid %d\n", info->nsid); + /* + * We've found two different namespaces on two different + * subsystems that report the same ID. This is pretty nasty + * for anything that actually requires unique device + * identification. In the kernel we need this for multipathing, + * and in user space the /dev/disk/by-id/ links rely on it. + * + * If the device also claims to be multi-path capable back off + * here now and refuse the probe the second device as this is a + * recipe for data corruption. If not this is probably a + * cheap consumer device if on the PCIe bus, so let the user + * proceed and use the shiny toy, but warn that with changing + * probing order (which due to our async probing could just be + * device taking longer to startup) the other device could show + * up at any time. + */ nvme_print_device_info(ctrl); - return ret; + if ((ns->ctrl->ops->flags & NVME_F_FABRICS) || /* !PCIe */ + ((ns->ctrl->subsys->cmic & NVME_CTRL_CMIC_MULTI_CTRL) && + info->is_shared)) { + dev_err(ctrl->device, + "ignoring nsid %d because of duplicate IDs\n", + info->nsid); + return ret; + } + + dev_err(ctrl->device, + "clearing duplicate IDs for nsid %d\n", info->nsid); + dev_err(ctrl->device, + "use of /dev/disk/by-id/ may cause data corruption\n"); + memset(&info->ids.nguid, 0, sizeof(info->ids.nguid)); + memset(&info->ids.uuid, 0, sizeof(info->ids.uuid)); + memset(&info->ids.eui64, 0, sizeof(info->ids.eui64)); + ctrl->quirks |= NVME_QUIRK_BOGUS_NID; } mutex_lock(&ctrl->subsys->lock); -- cgit v1.2.3 From b8f6446b6853768cb99e7c201bddce69ca60c15e Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 13 Jul 2023 17:26:20 +0800 Subject: nvme-pci: fix DMA direction of unmapping integrity data DMA direction should be taken in dma_unmap_page() for unmapping integrity data. Fix this DMA direction, and reported in Guangwu's test. Reported-by: Guangwu Zhang Fixes: 4aedb705437f ("nvme-pci: split metadata handling from nvme_map_data / nvme_unmap_data") Signed-off-by: Ming Lei Reviewed-by: Christoph Hellwig Signed-off-by: Keith Busch --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 8e7dbe0ab890..baf69af7ea78 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -967,7 +967,7 @@ static __always_inline void nvme_pci_unmap_rq(struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); dma_unmap_page(dev->dev, iod->meta_dma, - rq_integrity_vec(req)->bv_len, rq_data_dir(req)); + rq_integrity_vec(req)->bv_len, rq_dma_dir(req)); } if (blk_rq_nr_phys_segments(req)) -- cgit v1.2.3 From d5f28bb1ce04636b285726ee2a5afa3a514025f4 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 8 Jul 2023 01:38:03 +0900 Subject: fprobes: Add a comment why fprobe_kprobe_handler exits if kprobe is running Add a comment the reason why fprobe_kprobe_handler() exits if any other kprobe is running. Link: https://lore.kernel.org/all/168874788299.159442.2485957441413653858.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230706120916.3c6abf15@gandalf.local.home/ Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/fprobe.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/fprobe.c b/kernel/trace/fprobe.c index 2571f7f3d5f2..59321d22f43e 100644 --- a/kernel/trace/fprobe.c +++ b/kernel/trace/fprobe.c @@ -100,6 +100,12 @@ static void fprobe_kprobe_handler(unsigned long ip, unsigned long parent_ip, return; } + /* + * This user handler is shared with other kprobes and is not expected to be + * called recursively. So if any other kprobe handler is running, this will + * exit as kprobe does. See the section 'Share the callbacks with kprobes' + * in Documentation/trace/fprobe.rst for more information. + */ if (unlikely(kprobe_running())) { fp->nmissed++; goto recursion_unlock; -- cgit v1.2.3 From 66bcf65d6cf0ca6540e2341e88ee7ef02dbdda08 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:29 +0900 Subject: tracing/probes: Fix to avoid double count of the string length on the array If an array is specified with the ustring or symstr, the length of the strings are accumlated on both of 'ret' and 'total', which means the length is double counted. Just set the length to the 'ret' value for avoiding double counting. Link: https://lore.kernel.org/all/168908492917.123124.15076463491122036025.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 88903c464321 ("tracing/probe: Add ustring type for user-space string") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 00707630788d..4735c5cb76fa 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -156,11 +156,11 @@ stage3: code++; goto array; case FETCH_OP_ST_USTRING: - ret += fetch_store_strlen_user(val + code->offset); + ret = fetch_store_strlen_user(val + code->offset); code++; goto array; case FETCH_OP_ST_SYMSTR: - ret += fetch_store_symstrlen(val + code->offset); + ret = fetch_store_symstrlen(val + code->offset); code++; goto array; default: -- cgit v1.2.3 From b41326b5e0f82e93592c4366359917b5d67b529f Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:38 +0900 Subject: tracing/probes: Fix not to count error code to total length Fix not to count the error code (which is minus value) to the total used length of array, because it can mess up the return code of process_fetch_insn_bottom(). Also clear the 'ret' value because it will be used for calculating next data_loc entry. Link: https://lore.kernel.org/all/168908493827.123124.2175257289106364229.stgit@devnote2/ Reported-by: Dan Carpenter Closes: https://lore.kernel.org/all/8819b154-2ba1-43c3-98a2-cbde20892023@moroto.mountain/ Fixes: 9b960a38835f ("tracing: probeevent: Unify fetch_insn processing common part") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 4735c5cb76fa..ed9d57c6b041 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -204,6 +204,8 @@ stage3: array: /* the last stage: Loop on array */ if (code->op == FETCH_OP_LP_ARRAY) { + if (ret < 0) + ret = 0; total += ret; if (++i < code->param) { code = s3; -- cgit v1.2.3 From e38e2c6a9efc435f9de344b7c91f7697e01b47d5 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:48 +0900 Subject: tracing/probes: Fix to update dynamic data counter if fetcharg uses it Fix to update dynamic data counter ('dyndata') and max length ('maxlen') only if the fetcharg uses the dynamic data. Also get out arg->dynamic from unlikely(). This makes dynamic data address wrong if process_fetch_insn() returns error on !arg->dynamic case. Link: https://lore.kernel.org/all/168908494781.123124.8160245359962103684.stgit@devnote2/ Suggested-by: Steven Rostedt Link: https://lore.kernel.org/all/20230710233400.5aaf024e@gandalf.local.home/ Fixes: 9178412ddf5a ("tracing: probeevent: Return consumed bytes of dynamic area") Cc: stable@vger.kernel.org Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace_probe_tmpl.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index ed9d57c6b041..185da001f4c3 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,11 +267,13 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (unlikely(ret < 0 && arg->dynamic)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; + if (arg->dynamic) { + if (unlikely(ret < 0)) { + *dl = make_data_loc(0, dyndata - base); + } else { + dyndata += ret; + maxlen -= ret; + } } } } -- cgit v1.2.3 From 4ed8f337dee32df71435689c19d22e4ee846e15a Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:15:57 +0900 Subject: Revert "tracing: Add "(fault)" name injection to kernel probes" This reverts commit 2e9906f84fc7c99388bb7123ade167250d50f1c0. It was turned out that commit 2e9906f84fc7 ("tracing: Add "(fault)" name injection to kernel probes") did not work correctly and probe events still show just '(fault)' (instead of '"(fault)"'). Also, current '(fault)' is more explicit that it faulted. This also moves FAULT_STRING macro to trace.h so that synthetic event can keep using it, and uses it in trace_probe.c too. Link: https://lore.kernel.org/all/168908495772.123124.1250788051922100079.stgit@devnote2/ Link: https://lore.kernel.org/all/20230706230642.3793a593@rorschach.local.home/ Cc: stable@vger.kernel.org Cc: Andrew Morton Cc: Tom Zanussi Signed-off-by: Masami Hiramatsu (Google) Reviewed-by: Steven Rostedt (Google) --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_probe.c | 2 +- kernel/trace/trace_probe_kernel.h | 31 ++++++------------------------- 3 files changed, 9 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 79bdefe9261b..eee1f3ca4749 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -113,6 +113,8 @@ enum trace_type { #define MEM_FAIL(condition, fmt, ...) \ DO_ONCE_LITE_IF(condition, pr_err, "ERROR: " fmt, ##__VA_ARGS__) +#define FAULT_STRING "(fault)" + #define HIST_STACKTRACE_DEPTH 16 #define HIST_STACKTRACE_SIZE (HIST_STACKTRACE_DEPTH * sizeof(unsigned long)) #define HIST_STACKTRACE_SKIP 5 diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 2d2616678295..591399ddcee5 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -65,7 +65,7 @@ int PRINT_TYPE_FUNC_NAME(string)(struct trace_seq *s, void *data, void *ent) int len = *(u32 *)data >> 16; if (!len) - trace_seq_puts(s, "(fault)"); + trace_seq_puts(s, FAULT_STRING); else trace_seq_printf(s, "\"%s\"", (const char *)get_loc_data(data, ent)); diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index c4e1d4c03a85..6deae2ce34f8 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -2,8 +2,6 @@ #ifndef __TRACE_PROBE_KERNEL_H_ #define __TRACE_PROBE_KERNEL_H_ -#define FAULT_STRING "(fault)" - /* * This depends on trace_probe.h, but can not include it due to * the way trace_probe_tmpl.h is used by trace_kprobe.c and trace_eprobe.c. @@ -15,16 +13,8 @@ static nokprobe_inline int fetch_store_strlen_user(unsigned long addr) { const void __user *uaddr = (__force const void __user *)addr; - int ret; - ret = strnlen_user_nofault(uaddr, MAX_STRING_SIZE); - /* - * strnlen_user_nofault returns zero on fault, insert the - * FAULT_STRING when that occurs. - */ - if (ret <= 0) - return strlen(FAULT_STRING) + 1; - return ret; + return strnlen_user_nofault(uaddr, MAX_STRING_SIZE); } /* Return the length of string -- including null terminal byte */ @@ -44,18 +34,7 @@ fetch_store_strlen(unsigned long addr) len++; } while (c && ret == 0 && len < MAX_STRING_SIZE); - /* For faults, return enough to hold the FAULT_STRING */ - return (ret < 0) ? strlen(FAULT_STRING) + 1 : len; -} - -static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base, int len) -{ - if (ret >= 0) { - *(u32 *)dest = make_data_loc(ret, __dest - base); - } else { - strscpy(__dest, FAULT_STRING, len); - ret = strlen(__dest) + 1; - } + return (ret < 0) ? ret : len; } /* @@ -76,7 +55,8 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } @@ -107,7 +87,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - set_data_loc(ret, dest, __dest, base, maxlen); + if (ret >= 0) + *(u32 *)dest = make_data_loc(ret, __dest - base); return ret; } -- cgit v1.2.3 From e0b7b2081233ac7fe55838ff68cbc7ca9887a91f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Jul 2023 12:46:29 -0700 Subject: MAINTAINERS: Foolishly claim maintainership of string routines Since the string API is tightly coupled with FORTIFY_SOURCE, I am offering myself up as maintainer for it. Thankfully Andy is already a reviewer and can keep me on the straight and narrow. Acked-by: Andy Shevchenko Link: https://lore.kernel.org/r/20230712194625.never.252-kees@kernel.org Signed-off-by: Kees Cook --- MAINTAINERS | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 3be1bdfe8ecc..b639d6123856 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8672,8 +8672,11 @@ S: Maintained F: drivers/input/touchscreen/resistive-adc-touch.c GENERIC STRING LIBRARY +M: Kees Cook R: Andy Shevchenko -S: Maintained +L: linux-hardening@vger.kernel.org +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/hardening F: include/linux/string.h F: include/linux/string_choices.h F: include/linux/string_helpers.h -- cgit v1.2.3 From ec7633de404e7ce704d8f79081b97bca5b616c23 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Jun 2023 11:49:18 +0200 Subject: sparc: mark __arch_xchg() as __always_inline An otherwise correct change to the atomic operations uncovered an existing bug in the sparc __arch_xchg() function, which is calls __xchg_called_with_bad_pointer() when its arguments are unknown at compile time: ERROR: modpost: "__xchg_called_with_bad_pointer" [lib/atomic64_test.ko] undefined! This now happens because gcc determines that it's better to not inline the function. Avoid this by just marking the function as __always_inline to force the compiler to do the right thing here. Reported-by: Guenter Roeck Link: https://lore.kernel.org/all/c525adc9-6623-4660-8718-e0c9311563b8@roeck-us.net/ Fixes: d12157efc8e08 ("locking/atomic: make atomic*_{cmp,}xchg optional") Signed-off-by: Arnd Bergmann Acked-by: Palmer Dabbelt Acked-by: Mark Rutland Reviewed-by: Sam Ravnborg Acked-by: Guenter Roeck Acked-by: Andi Shyti Link: https://lore.kernel.org/r/20230628094938.2318171-1-arnd@kernel.org Signed-off-by: Kees Cook --- arch/sparc/include/asm/cmpxchg_32.h | 2 +- arch/sparc/include/asm/cmpxchg_64.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h index 7a1339533d1d..d0af82c240b7 100644 --- a/arch/sparc/include/asm/cmpxchg_32.h +++ b/arch/sparc/include/asm/cmpxchg_32.h @@ -15,7 +15,7 @@ unsigned long __xchg_u32(volatile u32 *m, u32 new); void __xchg_called_with_bad_pointer(void); -static inline unsigned long __arch_xchg(unsigned long x, __volatile__ void * ptr, int size) +static __always_inline unsigned long __arch_xchg(unsigned long x, __volatile__ void * ptr, int size) { switch (size) { case 4: diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h index 66cd61dde9ec..3de25262c411 100644 --- a/arch/sparc/include/asm/cmpxchg_64.h +++ b/arch/sparc/include/asm/cmpxchg_64.h @@ -87,7 +87,7 @@ xchg16(__volatile__ unsigned short *m, unsigned short val) return (load32 & mask) >> bit_shift; } -static inline unsigned long +static __always_inline unsigned long __arch_xchg(unsigned long x, __volatile__ void * ptr, int size) { switch (size) { -- cgit v1.2.3 From 5c17f45e91f5035c1b317e93b3dfb01088ac2902 Mon Sep 17 00:00:00 2001 From: Chengming Zhou Date: Mon, 10 Jul 2023 18:55:16 +0800 Subject: blk-mq: fix start_time_ns and alloc_time_ns for pre-allocated rq The iocost rely on rq start_time_ns and alloc_time_ns to tell saturation state of the block device. Most of the time request is allocated after rq_qos_throttle() and its alloc_time_ns or start_time_ns won't be affected. But for plug batched allocation introduced by the commit 47c122e35d7e ("block: pre-allocate requests if plug is started and is a batch"), we can rq_qos_throttle() after the allocation of the request. This is what the blk_mq_get_cached_request() does. In this case, the cached request alloc_time_ns or start_time_ns is much ahead if blocked in any qos ->throttle(). Fix it by setting alloc_time_ns and start_time_ns to now when the allocated request is actually used. Signed-off-by: Chengming Zhou Acked-by: Tejun Heo Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230710105516.2053478-1-chengming.zhou@linux.dev Signed-off-by: Jens Axboe --- block/blk-mq.c | 47 ++++++++++++++++++++++++++++++----------------- 1 file changed, 30 insertions(+), 17 deletions(-) diff --git a/block/blk-mq.c b/block/blk-mq.c index 5504719b970d..d50b1d62a3d9 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -328,8 +328,24 @@ void blk_rq_init(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_rq_init); +/* Set start and alloc time when the allocated request is actually used */ +static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) +{ + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; + +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + if (blk_queue_rq_alloc_time(rq->q)) + rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + else + rq->alloc_time_ns = 0; +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; @@ -356,14 +372,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, } rq->timeout = 0; - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); - else - rq->start_time_ns = 0; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - rq->alloc_time_ns = alloc_time_ns; -#endif rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; @@ -393,8 +402,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, } static inline struct request * -__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, - u64 alloc_time_ns) +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; @@ -413,7 +421,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); + rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add(data->cached_rq, rq); nr++; } @@ -474,9 +482,11 @@ retry: * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { - rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); - if (rq) + rq = __blk_mq_alloc_requests_batch(data); + if (rq) { + blk_mq_rq_time_init(rq, alloc_time_ns); return rq; + } data->nr_tags = 1; } @@ -499,8 +509,9 @@ retry: goto retry; } - return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, - alloc_time_ns); + rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); + return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, @@ -555,6 +566,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, return NULL; plug->cached_rq = rq_list_next(rq); + blk_mq_rq_time_init(rq, 0); } rq->cmd_flags = opf; @@ -656,8 +668,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, - alloc_time_ns); + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -2896,6 +2908,7 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, plug->cached_rq = rq_list_next(rq); rq_qos_throttle(q, *bio); + blk_mq_rq_time_init(rq, 0); rq->cmd_flags = (*bio)->bi_opf; INIT_LIST_HEAD(&rq->queuelist); return rq; -- cgit v1.2.3 From b33b8731566d50aae6e11bd02921452cf3a7e0e7 Mon Sep 17 00:00:00 2001 From: Ricardo Cañuelo Date: Thu, 29 Jun 2023 10:35:46 +0200 Subject: selftests/mincore: fix skip condition for check_huge_pages test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The check_huge_pages test was failing instead of skipping on qemu-armv7 because the skip condition wasn't handled properly. Add an additional check to fix it. Signed-off-by: Ricardo Cañuelo Reported-by: Naresh Kamboju Reported-by: Linux Kernel Functional Testing Reviewed-by: Muhammad Usama Anjum Tested-by: Anders Roxell Closes: https://lore.kernel.org/all/CA+G9fYuoB8Ug8PcTU-YGmemL7_eeEksXFihvxWF6OikD7sK7pA@mail.gmail.com Signed-off-by: Shuah Khan --- tools/testing/selftests/mincore/mincore_selftest.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mincore/mincore_selftest.c b/tools/testing/selftests/mincore/mincore_selftest.c index 4c88238fc8f0..e949a43a6145 100644 --- a/tools/testing/selftests/mincore/mincore_selftest.c +++ b/tools/testing/selftests/mincore/mincore_selftest.c @@ -150,8 +150,8 @@ TEST(check_huge_pages) MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); if (addr == MAP_FAILED) { - if (errno == ENOMEM) - SKIP(return, "No huge pages available."); + if (errno == ENOMEM || errno == EINVAL) + SKIP(return, "No huge pages available or CONFIG_HUGETLB_PAGE disabled."); else TH_LOG("mmap error: %s", strerror(errno)); } -- cgit v1.2.3 From e8b03aef194c3c8597f4fb8e58ade1cc1f43001e Mon Sep 17 00:00:00 2001 From: Minjie Du Date: Thu, 6 Jul 2023 11:22:57 +0800 Subject: tools: timers: fix freq average calculation Delete a duplicate assignment from this function implementation. The note means ppm is average of the two actual freq samples. But ppm have a duplicate assignment. Signed-off-by: Minjie Du Acked-by: John Stultz Signed-off-by: Shuah Khan --- tools/testing/selftests/timers/raw_skew.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/testing/selftests/timers/raw_skew.c b/tools/testing/selftests/timers/raw_skew.c index 5beceeed0d11..6eba203f9da7 100644 --- a/tools/testing/selftests/timers/raw_skew.c +++ b/tools/testing/selftests/timers/raw_skew.c @@ -129,8 +129,7 @@ int main(int argc, char **argv) printf("%lld.%i(est)", eppm/1000, abs((int)(eppm%1000))); /* Avg the two actual freq samples adjtimex gave us */ - ppm = (tx1.freq + tx2.freq) * 1000 / 2; - ppm = (long long)tx1.freq * 1000; + ppm = (long long)(tx1.freq + tx2.freq) * 1000 / 2; ppm = shift_right(ppm, 16); printf(" %lld.%i(act)", ppm/1000, abs((int)(ppm%1000))); -- cgit v1.2.3 From 95ce158b6c93b28842b54b42ad1cb221b9844062 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Thu, 13 Jul 2023 00:34:05 +0200 Subject: dsa: mv88e6xxx: Do a final check before timing out I get sporadic timeouts from the driver when using the MV88E6352. Reading the status again after the loop fixes the problem: the operation is successful but goes undetected. Some added prints show things like this: [ 58.356209] mv88e6085 mdio_mux-0.1:00: Timeout while waiting for switch, addr 1b reg 0b, mask 8000, val 0000, data c000 [ 58.367487] mv88e6085 mdio_mux-0.1:00: Timeout waiting for ATU op 4000, fid 0001 (...) [ 61.826293] mv88e6085 mdio_mux-0.1:00: Timeout while waiting for switch, addr 1c reg 18, mask 8000, val 0000, data 9860 [ 61.837560] mv88e6085 mdio_mux-0.1:00: Timeout waiting for PHY command 1860 to complete The reason is probably not the commands: I think those are mostly fine with the 50+50ms timeout, but the problem appears when OpenWrt brings up several interfaces in parallel on a system with 7 populated ports: if one of them take more than 50 ms and waits one or more of the others can get stuck on the mutex for the switch and then this can easily multiply. As we sleep and wait, the function loop needs a final check after exiting the loop if we were successful. Suggested-by: Andrew Lunn Cc: Tobias Waldekranz Fixes: 35da1dfd9484 ("net: dsa: mv88e6xxx: Improve performance of busy bit polling") Signed-off-by: Linus Walleij Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20230712223405.861899-1-linus.walleij@linaro.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/chip.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/dsa/mv88e6xxx/chip.c b/drivers/net/dsa/mv88e6xxx/chip.c index 8b51756bd805..c7d51a539451 100644 --- a/drivers/net/dsa/mv88e6xxx/chip.c +++ b/drivers/net/dsa/mv88e6xxx/chip.c @@ -109,6 +109,13 @@ int mv88e6xxx_wait_mask(struct mv88e6xxx_chip *chip, int addr, int reg, usleep_range(1000, 2000); } + err = mv88e6xxx_read(chip, addr, reg, &data); + if (err) + return err; + + if ((data & mask) == val) + return 0; + dev_err(chip->dev, "Timeout while waiting for switch\n"); return -ETIMEDOUT; } -- cgit v1.2.3 From 5e1627cb43ddf1b24b92eb26f8d958a3f5676ccb Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 12 Jul 2023 10:15:10 -0400 Subject: net: usbnet: Fix WARNING in usbnet_start_xmit/usb_submit_urb The syzbot fuzzer identified a problem in the usbnet driver: usb 1-1: BOGUS urb xfer, pipe 3 != type 1 WARNING: CPU: 0 PID: 754 at drivers/usb/core/urb.c:504 usb_submit_urb+0xed6/0x1880 drivers/usb/core/urb.c:504 Modules linked in: CPU: 0 PID: 754 Comm: kworker/0:2 Not tainted 6.4.0-rc7-syzkaller-00014-g692b7dc87ca6 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 05/27/2023 Workqueue: mld mld_ifc_work RIP: 0010:usb_submit_urb+0xed6/0x1880 drivers/usb/core/urb.c:504 Code: 7c 24 18 e8 2c b4 5b fb 48 8b 7c 24 18 e8 42 07 f0 fe 41 89 d8 44 89 e1 4c 89 ea 48 89 c6 48 c7 c7 a0 c9 fc 8a e8 5a 6f 23 fb <0f> 0b e9 58 f8 ff ff e8 fe b3 5b fb 48 81 c5 c0 05 00 00 e9 84 f7 RSP: 0018:ffffc9000463f568 EFLAGS: 00010086 RAX: 0000000000000000 RBX: 0000000000000001 RCX: 0000000000000000 RDX: ffff88801eb28000 RSI: ffffffff814c03b7 RDI: 0000000000000001 RBP: ffff8881443b7190 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000001 R12: 0000000000000003 R13: ffff88802a77cb18 R14: 0000000000000003 R15: ffff888018262500 FS: 0000000000000000(0000) GS:ffff8880b9800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000556a99c15a18 CR3: 0000000028c71000 CR4: 0000000000350ef0 Call Trace: usbnet_start_xmit+0xfe5/0x2190 drivers/net/usb/usbnet.c:1453 __netdev_start_xmit include/linux/netdevice.h:4918 [inline] netdev_start_xmit include/linux/netdevice.h:4932 [inline] xmit_one net/core/dev.c:3578 [inline] dev_hard_start_xmit+0x187/0x700 net/core/dev.c:3594 ... This bug is caused by the fact that usbnet trusts the bulk endpoint addresses its probe routine receives in the driver_info structure, and it does not check to see that these endpoints actually exist and have the expected type and directions. The fix is simply to add such a check. Reported-and-tested-by: syzbot+63ee658b9a100ffadbe2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/linux-usb/000000000000a56e9105d0cec021@google.com/ Signed-off-by: Alan Stern CC: Oliver Neukum Link: https://lore.kernel.org/r/ea152b6d-44df-4f8a-95c6-4db51143dcc1@rowland.harvard.edu Signed-off-by: Jakub Kicinski --- drivers/net/usb/usbnet.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 283ffddda821..2d14b0d78541 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1775,6 +1775,10 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) } else if (!info->in || !info->out) status = usbnet_get_endpoints (dev, udev); else { + u8 ep_addrs[3] = { + info->in + USB_DIR_IN, info->out + USB_DIR_OUT, 0 + }; + dev->in = usb_rcvbulkpipe (xdev, info->in); dev->out = usb_sndbulkpipe (xdev, info->out); if (!(info->flags & FLAG_NO_SETINT)) @@ -1784,6 +1788,8 @@ usbnet_probe (struct usb_interface *udev, const struct usb_device_id *prod) else status = 0; + if (status == 0 && !usb_check_bulk_endpoints(udev, ep_addrs)) + status = -EINVAL; } if (status >= 0 && dev->status) status = init_status (dev, udev); -- cgit v1.2.3 From 9845217d60d01d151b45842ef2017a65e8f39f5a Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Wed, 12 Jul 2023 12:16:16 +0100 Subject: net: dsa: ar9331: Use explict flags for regmap single read/write The at9331 is only able to read or write a single register at once. The driver has a custom regmap bus and chooses to tell the regmap core about this by reporting the maximum transfer sizes rather than the explicit flags that exist at the regmap level. Since there are a number of problems with the raw transfer limits and the regmap level flags are better integrated anyway convert the driver to use the flags. No functional change. Signed-off-by: Mark Brown Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/dsa/qca/ar9331.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/qca/ar9331.c b/drivers/net/dsa/qca/ar9331.c index b2bf78ac485e..3b0937031499 100644 --- a/drivers/net/dsa/qca/ar9331.c +++ b/drivers/net/dsa/qca/ar9331.c @@ -1002,6 +1002,8 @@ static const struct regmap_config ar9331_mdio_regmap_config = { .val_bits = 32, .reg_stride = 4, .max_register = AR9331_SW_REG_PAGE, + .use_single_read = true, + .use_single_write = true, .ranges = ar9331_regmap_range, .num_ranges = ARRAY_SIZE(ar9331_regmap_range), @@ -1018,8 +1020,6 @@ static struct regmap_bus ar9331_sw_bus = { .val_format_endian_default = REGMAP_ENDIAN_NATIVE, .read = ar9331_mdio_read, .write = ar9331_sw_bus_write, - .max_raw_read = 4, - .max_raw_write = 4, }; static int ar9331_sw_probe(struct mdio_device *mdiodev) -- cgit v1.2.3 From b685f1a58956fa36cc01123f253351b25bfacfda Mon Sep 17 00:00:00 2001 From: Tanmay Patil Date: Wed, 12 Jul 2023 16:36:57 +0530 Subject: net: ethernet: ti: cpsw_ale: Fix cpsw_ale_get_field()/cpsw_ale_set_field() CPSW ALE has 75 bit ALE entries which are stored within three 32 bit words. The cpsw_ale_get_field() and cpsw_ale_set_field() functions assume that the field will be strictly contained within one word. However, this is not guaranteed to be the case and it is possible for ALE field entries to span across up to two words at the most. Fix the methods to handle getting/setting fields spanning up to two words. Fixes: db82173f23c5 ("netdev: driver: ethernet: add cpsw address lookup engine support") Signed-off-by: Tanmay Patil [s-vadapalli@ti.com: rephrased commit message and added Fixes tag] Signed-off-by: Siddharth Vadapalli Signed-off-by: David S. Miller --- drivers/net/ethernet/ti/cpsw_ale.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw_ale.c b/drivers/net/ethernet/ti/cpsw_ale.c index 0c5e783e574c..64bf22cd860c 100644 --- a/drivers/net/ethernet/ti/cpsw_ale.c +++ b/drivers/net/ethernet/ti/cpsw_ale.c @@ -106,23 +106,37 @@ struct cpsw_ale_dev_id { static inline int cpsw_ale_get_field(u32 *ale_entry, u32 start, u32 bits) { - int idx; + int idx, idx2; + u32 hi_val = 0; idx = start / 32; + idx2 = (start + bits - 1) / 32; + /* Check if bits to be fetched exceed a word */ + if (idx != idx2) { + idx2 = 2 - idx2; /* flip */ + hi_val = ale_entry[idx2] << ((idx2 * 32) - start); + } start -= idx * 32; idx = 2 - idx; /* flip */ - return (ale_entry[idx] >> start) & BITMASK(bits); + return (hi_val + (ale_entry[idx] >> start)) & BITMASK(bits); } static inline void cpsw_ale_set_field(u32 *ale_entry, u32 start, u32 bits, u32 value) { - int idx; + int idx, idx2; value &= BITMASK(bits); - idx = start / 32; + idx = start / 32; + idx2 = (start + bits - 1) / 32; + /* Check if bits to be set exceed a word */ + if (idx != idx2) { + idx2 = 2 - idx2; /* flip */ + ale_entry[idx2] &= ~(BITMASK(bits + start - (idx2 * 32))); + ale_entry[idx2] |= (value >> ((idx2 * 32) - start)); + } start -= idx * 32; - idx = 2 - idx; /* flip */ + idx = 2 - idx; /* flip */ ale_entry[idx] &= ~(BITMASK(bits) << start); ale_entry[idx] |= (value << start); } -- cgit v1.2.3 From 56a16035bb6effb37177867cea94c13a8382f745 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 12 Jul 2023 08:44:49 -0700 Subject: bridge: Add extack warning when enabling STP in netns. When we create an L2 loop on a bridge in netns, we will see packets storm even if STP is enabled. # unshare -n # ip link add br0 type bridge # ip link add veth0 type veth peer name veth1 # ip link set veth0 master br0 up # ip link set veth1 master br0 up # ip link set br0 type bridge stp_state 1 # ip link set br0 up # sleep 30 # ip -s link show br0 2: br0: mtu 1500 qdisc noqueue state UP mode DEFAULT group default qlen 1000 link/ether b6:61:98:1c:1c:b5 brd ff:ff:ff:ff:ff:ff RX: bytes packets errors dropped missed mcast 956553768 12861249 0 0 0 12861249 <-. Keep TX: bytes packets errors dropped carrier collsns | increasing 1027834 11951 0 0 0 0 <-' rapidly This is because llc_rcv() drops all packets in non-root netns and BPDU is dropped. Let's add extack warning when enabling STP in netns. # unshare -n # ip link add br0 type bridge # ip link set br0 type bridge stp_state 1 Warning: bridge: STP does not work in non-root netns. Note this commit will be reverted later when we namespacify the whole LLC infra. Fixes: e730c15519d0 ("[NET]: Make packet reception network namespace safe") Suggested-by: Harry Coin Link: https://lore.kernel.org/netdev/0f531295-e289-022d-5add-5ceffa0df9bc@quietfountain.com/ Suggested-by: Ido Schimmel Signed-off-by: Kuniyuki Iwashima Acked-by: Nikolay Aleksandrov Reviewed-by: Ido Schimmel Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 75204d36d7f9..b65962682771 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -201,6 +201,9 @@ int br_stp_set_enabled(struct net_bridge *br, unsigned long val, { ASSERT_RTNL(); + if (!net_eq(dev_net(br->dev), &init_net)) + NL_SET_ERR_MSG_MOD(extack, "STP does not work in non-root netns"); + if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, "STP can't be enabled if MRP is already enabled"); -- cgit v1.2.3 From 1d6d537dc55d1f42d16290f00157ac387985b95b Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Thu, 13 Jul 2023 03:42:29 +0100 Subject: net: ethernet: mtk_eth_soc: handle probe deferral Move the call to of_get_ethdev_address to mtk_add_mac which is part of the probe function and can hence itself return -EPROBE_DEFER should of_get_ethdev_address return -EPROBE_DEFER. This allows us to entirely get rid of the mtk_init function. The problem of of_get_ethdev_address returning -EPROBE_DEFER surfaced in situations in which the NVMEM provider holding the MAC address has not yet be loaded at the time mtk_eth_soc is initially probed. In this case probing of mtk_eth_soc should be deferred instead of falling back to use a random MAC address, so once the NVMEM provider becomes available probing can be repeated. Fixes: 656e705243fd ("net-next: mediatek: add support for MT7623 ethernet") Signed-off-by: Daniel Golle Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 29 +++++++++++------------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 834c644b67db..2d15342c260a 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3846,23 +3846,6 @@ static int mtk_hw_deinit(struct mtk_eth *eth) return 0; } -static int __init mtk_init(struct net_device *dev) -{ - struct mtk_mac *mac = netdev_priv(dev); - struct mtk_eth *eth = mac->hw; - int ret; - - ret = of_get_ethdev_address(mac->of_node, dev); - if (ret) { - /* If the mac address is invalid, use random mac address */ - eth_hw_addr_random(dev); - dev_err(eth->dev, "generated random MAC address %pM\n", - dev->dev_addr); - } - - return 0; -} - static void mtk_uninit(struct net_device *dev) { struct mtk_mac *mac = netdev_priv(dev); @@ -4278,7 +4261,6 @@ static const struct ethtool_ops mtk_ethtool_ops = { }; static const struct net_device_ops mtk_netdev_ops = { - .ndo_init = mtk_init, .ndo_uninit = mtk_uninit, .ndo_open = mtk_open, .ndo_stop = mtk_stop, @@ -4340,6 +4322,17 @@ static int mtk_add_mac(struct mtk_eth *eth, struct device_node *np) mac->hw = eth; mac->of_node = np; + err = of_get_ethdev_address(mac->of_node, eth->netdev[id]); + if (err == -EPROBE_DEFER) + return err; + + if (err) { + /* If the mac address is invalid, use random mac address */ + eth_hw_addr_random(eth->netdev[id]); + dev_err(eth->dev, "generated random MAC address %pM\n", + eth->netdev[id]->dev_addr); + } + memset(mac->hwlro_ip, 0, sizeof(mac->hwlro_ip)); mac->hwlro_ip_cnt = 0; -- cgit v1.2.3 From 797311bce5c2ac90b8d65e357603cfd410d36ebb Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Tue, 11 Jul 2023 23:16:07 +0900 Subject: tracing/probes: Fix to record 0-length data_loc in fetch_store_string*() if fails Fix to record 0-length data to data_loc in fetch_store_string*() if it fails to get the string data. Currently those expect that the data_loc is updated by store_trace_args() if it returns the error code. However, that does not work correctly if the argument is an array of strings. In that case, store_trace_args() only clears the first entry of the array (which may have no error) and leaves other entries. So it should be cleared by fetch_store_string*() itself. Also, 'dyndata' and 'maxlen' in store_trace_args() should be updated only if it is used (ret > 0 and argument is a dynamic data.) Link: https://lore.kernel.org/all/168908496683.123124.4761206188794205601.stgit@devnote2/ Fixes: 40b53b771806 ("tracing: probeevent: Add array type support") Cc: stable@vger.kernel.org Reviewed-by: Steven Rostedt (Google) Signed-off-by: Masami Hiramatsu (Google) --- kernel/trace/trace_probe_kernel.h | 13 +++++++++---- kernel/trace/trace_probe_tmpl.h | 10 +++------- kernel/trace/trace_uprobe.c | 3 ++- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/kernel/trace/trace_probe_kernel.h b/kernel/trace/trace_probe_kernel.h index 6deae2ce34f8..bb723eefd7b7 100644 --- a/kernel/trace/trace_probe_kernel.h +++ b/kernel/trace/trace_probe_kernel.h @@ -37,6 +37,13 @@ fetch_store_strlen(unsigned long addr) return (ret < 0) ? ret : len; } +static nokprobe_inline void set_data_loc(int ret, void *dest, void *__dest, void *base) +{ + if (ret < 0) + ret = 0; + *(u32 *)dest = make_data_loc(ret, __dest - base); +} + /* * Fetch a null-terminated string from user. Caller MUST set *(u32 *)buf * with max length and relative data location. @@ -55,8 +62,7 @@ fetch_store_string_user(unsigned long addr, void *dest, void *base) __dest = get_loc_data(dest, base); ret = strncpy_from_user_nofault(__dest, uaddr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } @@ -87,8 +93,7 @@ fetch_store_string(unsigned long addr, void *dest, void *base) * probing. */ ret = strncpy_from_kernel_nofault(__dest, (void *)addr, maxlen); - if (ret >= 0) - *(u32 *)dest = make_data_loc(ret, __dest - base); + set_data_loc(ret, dest, __dest, base); return ret; } diff --git a/kernel/trace/trace_probe_tmpl.h b/kernel/trace/trace_probe_tmpl.h index 185da001f4c3..3935b347f874 100644 --- a/kernel/trace/trace_probe_tmpl.h +++ b/kernel/trace/trace_probe_tmpl.h @@ -267,13 +267,9 @@ store_trace_args(void *data, struct trace_probe *tp, void *rec, if (unlikely(arg->dynamic)) *dl = make_data_loc(maxlen, dyndata - base); ret = process_fetch_insn(arg->code, rec, dl, base); - if (arg->dynamic) { - if (unlikely(ret < 0)) { - *dl = make_data_loc(0, dyndata - base); - } else { - dyndata += ret; - maxlen -= ret; - } + if (arg->dynamic && likely(ret > 0)) { + dyndata += ret; + maxlen -= ret; } } } diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index 8b92e34ff0c8..7b47e9a2c010 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -170,7 +170,8 @@ fetch_store_string(unsigned long addr, void *dest, void *base) */ ret++; *(u32 *)dest = make_data_loc(ret, (void *)dst - base); - } + } else + *(u32 *)dest = make_data_loc(0, (void *)dst - base); return ret; } -- cgit v1.2.3 From 4ad23d2368ccce6da74edc74e69300a4d75a2379 Mon Sep 17 00:00:00 2001 From: Wang Ming Date: Thu, 13 Jul 2023 17:51:06 +0800 Subject: bna: Remove error checking for debugfs_create_dir() It is expected that most callers should _ignore_ the errors return by debugfs_create_dir() in bnad_debugfs_init(). Signed-off-by: Wang Ming Signed-off-by: David S. Miller --- drivers/net/ethernet/brocade/bna/bnad_debugfs.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c index 04ad0f2b9677..7246e13dd559 100644 --- a/drivers/net/ethernet/brocade/bna/bnad_debugfs.c +++ b/drivers/net/ethernet/brocade/bna/bnad_debugfs.c @@ -512,11 +512,6 @@ bnad_debugfs_init(struct bnad *bnad) if (!bnad->port_debugfs_root) { bnad->port_debugfs_root = debugfs_create_dir(name, bna_debugfs_root); - if (!bnad->port_debugfs_root) { - netdev_warn(bnad->netdev, - "debugfs root dir creation failed\n"); - return; - } atomic_inc(&bna_debugfs_port_count); -- cgit v1.2.3 From a822551c51f0dc063305ca16b9dd0dec7fe1f259 Mon Sep 17 00:00:00 2001 From: Wang Ming Date: Thu, 13 Jul 2023 20:16:03 +0800 Subject: net: ethernet: Remove repeating expression Identify issues that arise by using the tests/doublebitand.cocci semantic patch. Need to remove duplicate expression in if statement. Signed-off-by: Wang Ming Reviewed-by: Jiawen Wu Signed-off-by: David S. Miller --- drivers/net/ethernet/wangxun/libwx/wx_hw.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/wangxun/libwx/wx_hw.c b/drivers/net/ethernet/wangxun/libwx/wx_hw.c index 39a9aeee7aab..6321178fc814 100644 --- a/drivers/net/ethernet/wangxun/libwx/wx_hw.c +++ b/drivers/net/ethernet/wangxun/libwx/wx_hw.c @@ -1511,7 +1511,6 @@ static void wx_configure_rx(struct wx *wx) psrtype = WX_RDB_PL_CFG_L4HDR | WX_RDB_PL_CFG_L3HDR | WX_RDB_PL_CFG_L2HDR | - WX_RDB_PL_CFG_TUN_TUNHDR | WX_RDB_PL_CFG_TUN_TUNHDR; wr32(wx, WX_RDB_PL_CFG(0), psrtype); -- cgit v1.2.3 From 9840036786d90cea11a90d1f30b6dc003b34ee67 Mon Sep 17 00:00:00 2001 From: Yan Zhai Date: Thu, 13 Jul 2023 10:28:00 -0700 Subject: gso: fix dodgy bit handling for GSO_UDP_L4 Commit 1fd54773c267 ("udp: allow header check for dodgy GSO_UDP_L4 packets.") checks DODGY bit for UDP, but for packets that can be fed directly to the device after gso_segs reset, it actually falls through to fragmentation: https://lore.kernel.org/all/CAJPywTKDdjtwkLVUW6LRA2FU912qcDmQOQGt2WaDo28KzYDg+A@mail.gmail.com/ This change restores the expected behavior of GSO_UDP_L4 packets. Fixes: 1fd54773c267 ("udp: allow header check for dodgy GSO_UDP_L4 packets.") Suggested-by: Willem de Bruijn Signed-off-by: Yan Zhai Reviewed-by: Willem de Bruijn Acked-by: Jason Wang Signed-off-by: David S. Miller --- net/ipv4/udp_offload.c | 16 +++++++++++----- net/ipv6/udp_offload.c | 3 +-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 75aa4de5b731..f402946da344 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -274,13 +274,20 @@ struct sk_buff *__udp_gso_segment(struct sk_buff *gso_skb, __sum16 check; __be16 newlen; - if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) - return __udp_gso_segment_list(gso_skb, features, is_ipv6); - mss = skb_shinfo(gso_skb)->gso_size; if (gso_skb->len <= sizeof(*uh) + mss) return ERR_PTR(-EINVAL); + if (skb_gso_ok(gso_skb, features | NETIF_F_GSO_ROBUST)) { + /* Packet is from an untrusted source, reset gso_segs. */ + skb_shinfo(gso_skb)->gso_segs = DIV_ROUND_UP(gso_skb->len - sizeof(*uh), + mss); + return NULL; + } + + if (skb_shinfo(gso_skb)->gso_type & SKB_GSO_FRAGLIST) + return __udp_gso_segment_list(gso_skb, features, is_ipv6); + skb_pull(gso_skb, sizeof(*uh)); /* clear destructor to avoid skb_segment assigning it to tail */ @@ -388,8 +395,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && - !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) return __udp_gso_segment(skb, features, false); mss = skb_shinfo(skb)->gso_size; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index ad3b8726873e..09fa7a42cb93 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -43,8 +43,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, if (!pskb_may_pull(skb, sizeof(struct udphdr))) goto out; - if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4 && - !skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) + if (skb_shinfo(skb)->gso_type & SKB_GSO_UDP_L4) return __udp_gso_segment(skb, features, true); mss = skb_shinfo(skb)->gso_size; -- cgit v1.2.3 From 911476ef3c585da9fdf156fd7aaa7455455daf76 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Mon, 26 Jun 2023 12:13:11 -0300 Subject: iommu: Fix crash during syfs iommu_groups/N/type The err_restore_domain flow was accidently inserted into the success path in commit 1000dccd5d13 ("iommu: Allow IOMMU_RESV_DIRECT to work on ARM"). It should only happen if iommu_create_device_direct_mappings() fails. This caused the domains the be wrongly changed and freed whenever the sysfs is used, resulting in an oops: BUG: kernel NULL pointer dereference, address: 0000000000000000 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 1 PID: 3417 Comm: avocado Not tainted 6.4.0-rc4-next-20230602 #3 Hardware name: Dell Inc. PowerEdge R6515/07PXPY, BIOS 2.3.6 07/06/2021 RIP: 0010:__iommu_attach_device+0xc/0xa0 Code: c0 c3 cc cc cc cc 48 89 f0 c3 cc cc cc cc 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 41 54 55 48 8b 47 08 <48> 8b 00 48 85 c0 74 74 48 89 f5 e8 64 12 49 00 41 89 c4 85 c0 74 RSP: 0018:ffffabae0220bd48 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff9ac04f70e410 RCX: 0000000000000001 RDX: ffff9ac044db20c0 RSI: ffff9ac044fa50d0 RDI: ffff9ac04f70e410 RBP: ffff9ac044fa50d0 R08: 1000000100209001 R09: 00000000000002dc R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ac043d54700 R13: ffff9ac043d54700 R14: 0000000000000001 R15: 0000000000000001 FS: 00007f02e30ae000(0000) GS:ffff9afeb2440000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000012afca006 CR4: 0000000000770ee0 PKRU: 55555554 Call Trace: ? __die+0x24/0x70 ? page_fault_oops+0x82/0x150 ? __iommu_queue_command_sync+0x80/0xc0 ? exc_page_fault+0x69/0x150 ? asm_exc_page_fault+0x26/0x30 ? __iommu_attach_device+0xc/0xa0 ? __iommu_attach_device+0x1c/0xa0 __iommu_device_set_domain+0x42/0x80 __iommu_group_set_domain_internal+0x5d/0x160 iommu_setup_default_domain+0x318/0x400 iommu_group_store_type+0xb1/0x200 kernfs_fop_write_iter+0x12f/0x1c0 vfs_write+0x2a2/0x3b0 ksys_write+0x63/0xe0 do_syscall_64+0x3f/0x90 entry_SYSCALL_64_after_hwframe+0x6e/0xd8 RIP: 0033:0x7f02e2f14a6f Reorganize the error flow so that the success branch and error branches are clearer. Fixes: 1000dccd5d13 ("iommu: Allow IOMMU_RESV_DIRECT to work on ARM") Reported-by: Dheeraj Kumar Srivastava Tested-by: Vasant Hegde Signed-off-by: Jason Gunthorpe Reviewed-by: Lu Baolu Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/0-v1-5bd8cc969d9e+1f1-iommu_set_def_fix_jgg@nvidia.com Signed-off-by: Joerg Roedel --- drivers/iommu/iommu.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c index da340f11c5f5..caaf563d38ae 100644 --- a/drivers/iommu/iommu.c +++ b/drivers/iommu/iommu.c @@ -2891,14 +2891,11 @@ static int iommu_setup_default_domain(struct iommu_group *group, ret = __iommu_group_set_domain_internal( group, dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); if (WARN_ON(ret)) - goto out_free; + goto out_free_old; } else { ret = __iommu_group_set_domain(group, dom); - if (ret) { - iommu_domain_free(dom); - group->default_domain = old_dom; - return ret; - } + if (ret) + goto err_restore_def_domain; } /* @@ -2911,20 +2908,24 @@ static int iommu_setup_default_domain(struct iommu_group *group, for_each_group_device(group, gdev) { ret = iommu_create_device_direct_mappings(dom, gdev->dev); if (ret) - goto err_restore; + goto err_restore_domain; } } -err_restore: - if (old_dom) { +out_free_old: + if (old_dom) + iommu_domain_free(old_dom); + return ret; + +err_restore_domain: + if (old_dom) __iommu_group_set_domain_internal( group, old_dom, IOMMU_SET_DOMAIN_MUST_SUCCEED); +err_restore_def_domain: + if (old_dom) { iommu_domain_free(dom); - old_dom = NULL; + group->default_domain = old_dom; } -out_free: - if (old_dom) - iommu_domain_free(old_dom); return ret; } -- cgit v1.2.3 From c20ecf7bb6153149b81a9277eda23398957656f2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 6 Apr 2023 11:55:31 +0300 Subject: iommu/sva: Fix signedness bug in iommu_sva_alloc_pasid() The ida_alloc_range() function returns negative error codes on error. On success it returns values in the min to max range (inclusive). It never returns more then INT_MAX even if "max" is higher. It never returns values in the 0 to (min - 1) range. The bug is that "min" is an unsigned int so negative error codes will be promoted to high positive values errors treated as success. Fixes: 1a14bf0fc7ed ("iommu/sva: Use GFP_KERNEL for pasid allocation") Signed-off-by: Dan Carpenter Reviewed-by: Lu Baolu Link: https://lore.kernel.org/r/6b32095d-7491-4ebb-a850-12e96209eaaf@kili.mountain Signed-off-by: Joerg Roedel --- drivers/iommu/iommu-sva.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/iommu-sva.c b/drivers/iommu/iommu-sva.c index 3ebd4b6586b3..05c0fb2acbc4 100644 --- a/drivers/iommu/iommu-sva.c +++ b/drivers/iommu/iommu-sva.c @@ -34,8 +34,9 @@ static int iommu_sva_alloc_pasid(struct mm_struct *mm, ioasid_t min, ioasid_t ma } ret = ida_alloc_range(&iommu_global_pasid_ida, min, max, GFP_KERNEL); - if (ret < min) + if (ret < 0) goto out; + mm->pasid = ret; ret = 0; out: -- cgit v1.2.3 From 0e022f5bf72f9d7b6f77466b41a87c2076aa8d03 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 09:52:36 -0300 Subject: perf beauty: Update copy of linux/socket.h with the kernel sources To pick the changes in: b848b26c6672c9b9 ("net: Kill MSG_SENDPAGE_NOTLAST") 5e2ff6704a275be0 ("scm: add SO_PASSPIDFD and SCM_PIDFD") 4fe38acdac8a71f7 ("net: Block MSG_SENDPAGE_* from being passed to sendmsg() by userspace") b841b901c452d926 ("net: Declare MSG_SPLICE_PAGES internal sendmsg() flag") That don't result in any changes in the tables generated from that header. But while updating I noticed we were not handling MSG_BATCH and MSG_ZEROCOPY in the hard coded table for the msg flags table, add them. This silences this perf build warning: Warning: Kernel ABI header differences: diff -u tools/perf/trace/beauty/include/linux/socket.h include/linux/socket.h Cc: Adrian Hunter Cc: Alexander Mikhalitsyn Cc: David Howells Cc: David S. Miller Cc: Ian Rogers Cc: Jakub Kicinski Cc: Jiri Olsa Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZLFGuHDwUGDGXdoR@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/trace/beauty/include/linux/socket.h | 5 +++++ tools/perf/trace/beauty/msg_flags.c | 8 ++++++++ 2 files changed, 13 insertions(+) diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h index 3bef212a24d7..39b74d83c7c4 100644 --- a/tools/perf/trace/beauty/include/linux/socket.h +++ b/tools/perf/trace/beauty/include/linux/socket.h @@ -177,6 +177,7 @@ static inline size_t msg_data_left(struct msghdr *msg) #define SCM_RIGHTS 0x01 /* rw: access rights (array of int) */ #define SCM_CREDENTIALS 0x02 /* rw: struct ucred */ #define SCM_SECURITY 0x03 /* rw: security label */ +#define SCM_PIDFD 0x04 /* ro: pidfd (int) */ struct ucred { __u32 pid; @@ -326,6 +327,7 @@ struct ucred { */ #define MSG_ZEROCOPY 0x4000000 /* Use user data in kernel path */ +#define MSG_SPLICE_PAGES 0x8000000 /* Splice the pages from the iterator in sendmsg() */ #define MSG_FASTOPEN 0x20000000 /* Send data in TCP SYN */ #define MSG_CMSG_CLOEXEC 0x40000000 /* Set close_on_exec for file descriptor received through @@ -336,6 +338,9 @@ struct ucred { #define MSG_CMSG_COMPAT 0 /* We never have 32 bit fixups */ #endif +/* Flags to be cleared on entry by sendmsg and sendmmsg syscalls */ +#define MSG_INTERNAL_SENDMSG_FLAGS \ + (MSG_SPLICE_PAGES | MSG_SENDPAGE_NOPOLICY | MSG_SENDPAGE_DECRYPTED) /* Setsockoptions(2) level. Thanks to BSD these must match IPPROTO_xxx */ #define SOL_IP 0 diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c index aa9934020232..ed3ff969b546 100644 --- a/tools/perf/trace/beauty/msg_flags.c +++ b/tools/perf/trace/beauty/msg_flags.c @@ -8,6 +8,12 @@ #ifndef MSG_WAITFORONE #define MSG_WAITFORONE 0x10000 #endif +#ifndef MSG_BATCH +#define MSG_BATCH 0x40000 +#endif +#ifndef MSG_ZEROCOPY +#define MSG_ZEROCOPY 0x4000000 +#endif #ifndef MSG_SPLICE_PAGES #define MSG_SPLICE_PAGES 0x8000000 #endif @@ -50,6 +56,8 @@ static size_t syscall_arg__scnprintf_msg_flags(char *bf, size_t size, P_MSG_FLAG(NOSIGNAL); P_MSG_FLAG(MORE); P_MSG_FLAG(WAITFORONE); + P_MSG_FLAG(BATCH); + P_MSG_FLAG(ZEROCOPY); P_MSG_FLAG(SPLICE_PAGES); P_MSG_FLAG(FASTOPEN); P_MSG_FLAG(CMSG_CLOEXEC); -- cgit v1.2.3 From 7b8615935560e1db55c962efa17a5cbdab349107 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 10:00:22 -0300 Subject: tools include UAPI: Sync linux/vhost.h with the kernel sources To get the changes in: 228a27cf78afc63a ("vhost: Allow worker switching while work is queueing") c1ecd8e950079774 ("vhost: allow userspace to create workers") To pick up these changes and support them: $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > before $ cp include/uapi/linux/vhost.h tools/include/uapi/linux/vhost.h $ tools/perf/trace/beauty/vhost_virtio_ioctl.sh > after $ diff -u before after --- before 2023-07-14 09:58:14.268249807 -0300 +++ after 2023-07-14 09:58:23.041493892 -0300 @@ -10,6 +10,7 @@ [0x12] = "SET_VRING_BASE", [0x13] = "SET_VRING_ENDIAN", [0x14] = "GET_VRING_ENDIAN", + [0x15] = "ATTACH_VRING_WORKER", [0x20] = "SET_VRING_KICK", [0x21] = "SET_VRING_CALL", [0x22] = "SET_VRING_ERR", @@ -31,10 +32,12 @@ [0x7C] = "VDPA_SET_GROUP_ASID", [0x7D] = "VDPA_SUSPEND", [0x7E] = "VDPA_RESUME", + [0x9] = "FREE_WORKER", }; static const char *vhost_virtio_ioctl_read_cmds[] = { [0x00] = "GET_FEATURES", [0x12] = "GET_VRING_BASE", + [0x16] = "GET_VRING_WORKER", [0x26] = "GET_BACKEND_FEATURES", [0x70] = "VDPA_GET_DEVICE_ID", [0x71] = "VDPA_GET_STATUS", @@ -44,6 +47,7 @@ [0x79] = "VDPA_GET_CONFIG_SIZE", [0x7A] = "VDPA_GET_AS_NUM", [0x7B] = "VDPA_GET_VRING_GROUP", + [0x8] = "NEW_WORKER", [0x80] = "VDPA_GET_VQS_COUNT", [0x81] = "VDPA_GET_GROUP_NUM", }; $ For instance, see how those 'cmd' ioctl arguments get translated, now ATTACH_VRING_WORKER, GET_VRING_WORKER and NEW_WORKER, will be as well: # perf trace -a -e ioctl --max-events=10 0.000 ( 0.011 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 21.353 ( 0.014 ms): pipewire/2261 ioctl(fd: 60, cmd: SNDRV_PCM_HWSYNC, arg: 0x1) = 0 25.766 ( 0.014 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 25.845 ( 0.034 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 25.916 ( 0.011 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 25.941 ( 0.025 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ATOMIC, arg: 0x7ffe4a22c840) = 0 32.915 ( 0.009 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_RMFB, arg: 0x7ffe4a22cf9c) = 0 42.522 ( 0.013 ms): gnome-shell/2196 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffe4a22c740) = 0 42.579 ( 0.031 ms): gnome-shel:cs0/2212 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7fd43915dc70) = 0 42.644 ( 0.010 ms): gnome-shell/2196 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffe4a22c8a0) = 0 # Cc: Adrian Hunter Cc: Ian Rogers Cc: Jiri Olsa Cc: Michael S. Tsirkin Cc: Mike Christie Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZLFJ%2FRsDGYiaH5nj@kernel.org/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/linux/vhost.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/tools/include/uapi/linux/vhost.h b/tools/include/uapi/linux/vhost.h index 92e1b700b51c..f5c48b61ab62 100644 --- a/tools/include/uapi/linux/vhost.h +++ b/tools/include/uapi/linux/vhost.h @@ -45,6 +45,25 @@ #define VHOST_SET_LOG_BASE _IOW(VHOST_VIRTIO, 0x04, __u64) /* Specify an eventfd file descriptor to signal on log write. */ #define VHOST_SET_LOG_FD _IOW(VHOST_VIRTIO, 0x07, int) +/* By default, a device gets one vhost_worker that its virtqueues share. This + * command allows the owner of the device to create an additional vhost_worker + * for the device. It can later be bound to 1 or more of its virtqueues using + * the VHOST_ATTACH_VRING_WORKER command. + * + * This must be called after VHOST_SET_OWNER and the caller must be the owner + * of the device. The new thread will inherit caller's cgroups and namespaces, + * and will share the caller's memory space. The new thread will also be + * counted against the caller's RLIMIT_NPROC value. + * + * The worker's ID used in other commands will be returned in + * vhost_worker_state. + */ +#define VHOST_NEW_WORKER _IOR(VHOST_VIRTIO, 0x8, struct vhost_worker_state) +/* Free a worker created with VHOST_NEW_WORKER if it's not attached to any + * virtqueue. If userspace is not able to call this for workers its created, + * the kernel will free all the device's workers when the device is closed. + */ +#define VHOST_FREE_WORKER _IOW(VHOST_VIRTIO, 0x9, struct vhost_worker_state) /* Ring setup. */ /* Set number of descriptors in ring. This parameter can not @@ -70,6 +89,18 @@ #define VHOST_VRING_BIG_ENDIAN 1 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct vhost_vring_state) #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct vhost_vring_state) +/* Attach a vhost_worker created with VHOST_NEW_WORKER to one of the device's + * virtqueues. + * + * This will replace the virtqueue's existing worker. If the replaced worker + * is no longer attached to any virtqueues, it can be freed with + * VHOST_FREE_WORKER. + */ +#define VHOST_ATTACH_VRING_WORKER _IOW(VHOST_VIRTIO, 0x15, \ + struct vhost_vring_worker) +/* Return the vring worker's ID */ +#define VHOST_GET_VRING_WORKER _IOWR(VHOST_VIRTIO, 0x16, \ + struct vhost_vring_worker) /* The following ioctls use eventfd file descriptors to signal and poll * for events. */ -- cgit v1.2.3 From 28e898ffa0c6a0875319d9362e387509819c9907 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 10:18:42 -0300 Subject: tools include UAPI: Sync the sound/asound.h copy with the kernel sources Picking the changes from: 01dfa8e969dbbc72 ("ALSA: ump: Add info flag bit for static blocks") e375b8a045873cf5 ("ALSA: ump: Add more attributes to UMP EP and FB info") 30fc139260d46e9b ("ALSA: ump: Add ioctls to inquiry UMP EP and Block info via control API") 127ae6f6dad2edb2 ("ALSA: rawmidi: Skip UMP devices at SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE") e3a8a5b726bdd903 ("ALSA: rawmidi: UMP support") a4bb75c4f19db711 ("ALSA: uapi: pcm: control the filling of the silence samples for drain") That harvests some new ioctls: $ tools/perf/trace/beauty/sndrv_ctl_ioctl.sh > before.ctl $ tools/perf/trace/beauty/sndrv_pcm_ioctl.sh > before.pcm $ cp include/uapi/sound/asound.h tools/include/uapi/sound/asound.h $ tools/perf/trace/beauty/sndrv_ctl_ioctl.sh > after.ctl $ tools/perf/trace/beauty/sndrv_pcm_ioctl.sh > after.pcm $ diff -u before.ctl after.ctl --- before.ctl 2023-07-14 10:17:00.319591889 -0300 +++ after.ctl 2023-07-14 10:17:24.668248373 -0300 @@ -22,6 +22,9 @@ [0x40] = "RAWMIDI_NEXT_DEVICE", [0x41] = "RAWMIDI_INFO", [0x42] = "RAWMIDI_PREFER_SUBDEVICE", + [0x43] = "UMP_NEXT_DEVICE", + [0x44] = "UMP_ENDPOINT_INFO", + [0x45] = "UMP_BLOCK_INFO", [0xd0] = "POWER", [0xd1] = "POWER_STATE", }; $ diff -u before.pcm after.pcm $ Now those will be decoded when they appear, see a system wide 'perf trace' session example here: # perf trace -e ioctl --max-events=10 0.000 ( 0.010 ms): gnome-shell/2240 ioctl(fd: 9, cmd: DRM_MODE_RMFB, arg: 0x7ffc0041d54c) = 0 2.444 ( 0.005 ms): wireplumber/2304 ioctl(fd: 47, cmd: TIOCOUTQ, arg: 0x7f16e9afea24) = 0 2.452 ( 0.002 ms): wireplumber/2304 ioctl(fd: 47, cmd: TIOCOUTQ, arg: 0x7f16e9afea24) = 0 11.348 ( 0.010 ms): gnome-shell/2240 ioctl(fd: 14, cmd: DRM_I915_IRQ_WAIT, arg: 0x7ffc0041ccf0) = 0 11.406 ( 0.037 ms): gnome-shel:cs0/2259 ioctl(fd: 14, cmd: DRM_I915_IRQ_EMIT, arg: 0x7f3cf69fdc60) = 0 11.476 ( 0.009 ms): gnome-shell/2240 ioctl(fd: 9, cmd: DRM_MODE_ADDFB2, arg: 0x7ffc0041ce50) = 0 11.497 ( 0.019 ms): gnome-shell/2240 ioctl(fd: 9, cmd: DRM_MODE_ATOMIC, arg: 0x7ffc0041cdf0) = 0 12.481 ( 0.020 ms): firefox:cs0/3651 ioctl(fd: 40, cmd: DRM_I915_IRQ_EMIT, arg: 0x7f1c365fea60) = 0 12.529 ( 0.009 ms): firefox:cs0/3651 ioctl(fd: 40, cmd: DRM_I915_IRQ_EMIT, arg: 0x7f1c365feab0) = 0 12.624 ( 0.018 ms): firefox:cs0/3651 ioctl(fd: 40, cmd: DRM_I915_IRQ_EMIT, arg: 0x7f1c365fea30) = 0 # Silencing these perf build warnings: Warning: Kernel ABI header differences: diff -u tools/include/uapi/sound/asound.h include/uapi/sound/asound.h Cc: Adrian Hunter Cc: Ian Rogers Cc: Jaroslav Kysela Cc: Jiri Olsa Cc: Namhyung Kim Cc: Takashi Iwai Link: https://lore.kernel.org/lkml/ZLFOrTE2+xZBgHGe@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/include/uapi/sound/asound.h | 81 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 79 insertions(+), 2 deletions(-) diff --git a/tools/include/uapi/sound/asound.h b/tools/include/uapi/sound/asound.h index 0aa955aa8246..f9939da41122 100644 --- a/tools/include/uapi/sound/asound.h +++ b/tools/include/uapi/sound/asound.h @@ -274,6 +274,7 @@ typedef int __bitwise snd_pcm_subformat_t; #define SNDRV_PCM_INFO_DOUBLE 0x00000004 /* Double buffering needed for PCM start/stop */ #define SNDRV_PCM_INFO_BATCH 0x00000010 /* double buffering */ #define SNDRV_PCM_INFO_SYNC_APPLPTR 0x00000020 /* need the explicit sync of appl_ptr update */ +#define SNDRV_PCM_INFO_PERFECT_DRAIN 0x00000040 /* silencing at the end of stream is not required */ #define SNDRV_PCM_INFO_INTERLEAVED 0x00000100 /* channels are interleaved */ #define SNDRV_PCM_INFO_NONINTERLEAVED 0x00000200 /* channels are not interleaved */ #define SNDRV_PCM_INFO_COMPLEX 0x00000400 /* complex frame organization (mmap only) */ @@ -383,6 +384,9 @@ typedef int snd_pcm_hw_param_t; #define SNDRV_PCM_HW_PARAMS_NORESAMPLE (1<<0) /* avoid rate resampling */ #define SNDRV_PCM_HW_PARAMS_EXPORT_BUFFER (1<<1) /* export buffer */ #define SNDRV_PCM_HW_PARAMS_NO_PERIOD_WAKEUP (1<<2) /* disable period wakeups */ +#define SNDRV_PCM_HW_PARAMS_NO_DRAIN_SILENCE (1<<3) /* suppress drain with the filling + * of the silence samples + */ struct snd_interval { unsigned int min, max; @@ -708,7 +712,7 @@ enum { * Raw MIDI section - /dev/snd/midi?? */ -#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 2) +#define SNDRV_RAWMIDI_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 4) enum { SNDRV_RAWMIDI_STREAM_OUTPUT = 0, @@ -719,6 +723,7 @@ enum { #define SNDRV_RAWMIDI_INFO_OUTPUT 0x00000001 #define SNDRV_RAWMIDI_INFO_INPUT 0x00000002 #define SNDRV_RAWMIDI_INFO_DUPLEX 0x00000004 +#define SNDRV_RAWMIDI_INFO_UMP 0x00000008 struct snd_rawmidi_info { unsigned int device; /* RO/WR (control): device number */ @@ -779,6 +784,72 @@ struct snd_rawmidi_status { }; #endif +/* UMP EP info flags */ +#define SNDRV_UMP_EP_INFO_STATIC_BLOCKS 0x01 + +/* UMP EP Protocol / JRTS capability bits */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI_MASK 0x0300 +#define SNDRV_UMP_EP_INFO_PROTO_MIDI1 0x0100 /* MIDI 1.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_MIDI2 0x0200 /* MIDI 2.0 */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_MASK 0x0003 +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_TX 0x0001 /* JRTS Transmit */ +#define SNDRV_UMP_EP_INFO_PROTO_JRTS_RX 0x0002 /* JRTS Receive */ + +/* UMP Endpoint information */ +struct snd_ump_endpoint_info { + int card; /* card number */ + int device; /* device number */ + unsigned int flags; /* additional info */ + unsigned int protocol_caps; /* protocol capabilities */ + unsigned int protocol; /* current protocol */ + unsigned int num_blocks; /* # of function blocks */ + unsigned short version; /* UMP major/minor version */ + unsigned short family_id; /* MIDI device family ID */ + unsigned short model_id; /* MIDI family model ID */ + unsigned int manufacturer_id; /* MIDI manufacturer ID */ + unsigned char sw_revision[4]; /* software revision */ + unsigned short padding; + unsigned char name[128]; /* endpoint name string */ + unsigned char product_id[128]; /* unique product id string */ + unsigned char reserved[32]; +} __packed; + +/* UMP direction */ +#define SNDRV_UMP_DIR_INPUT 0x01 +#define SNDRV_UMP_DIR_OUTPUT 0x02 +#define SNDRV_UMP_DIR_BIDIRECTION 0x03 + +/* UMP block info flags */ +#define SNDRV_UMP_BLOCK_IS_MIDI1 (1U << 0) /* MIDI 1.0 port w/o restrict */ +#define SNDRV_UMP_BLOCK_IS_LOWSPEED (1U << 1) /* 31.25Kbps B/W MIDI1 port */ + +/* UMP block user-interface hint */ +#define SNDRV_UMP_BLOCK_UI_HINT_UNKNOWN 0x00 +#define SNDRV_UMP_BLOCK_UI_HINT_RECEIVER 0x01 +#define SNDRV_UMP_BLOCK_UI_HINT_SENDER 0x02 +#define SNDRV_UMP_BLOCK_UI_HINT_BOTH 0x03 + +/* UMP groups and blocks */ +#define SNDRV_UMP_MAX_GROUPS 16 +#define SNDRV_UMP_MAX_BLOCKS 32 + +/* UMP Block information */ +struct snd_ump_block_info { + int card; /* card number */ + int device; /* device number */ + unsigned char block_id; /* block ID (R/W) */ + unsigned char direction; /* UMP direction */ + unsigned char active; /* Activeness */ + unsigned char first_group; /* first group ID */ + unsigned char num_groups; /* number of groups */ + unsigned char midi_ci_version; /* MIDI-CI support version */ + unsigned char sysex8_streams; /* max number of sysex8 streams */ + unsigned char ui_hint; /* user interface hint */ + unsigned int flags; /* various info flags */ + unsigned char name[128]; /* block name string */ + unsigned char reserved[32]; +} __packed; + #define SNDRV_RAWMIDI_IOCTL_PVERSION _IOR('W', 0x00, int) #define SNDRV_RAWMIDI_IOCTL_INFO _IOR('W', 0x01, struct snd_rawmidi_info) #define SNDRV_RAWMIDI_IOCTL_USER_PVERSION _IOW('W', 0x02, int) @@ -786,6 +857,9 @@ struct snd_rawmidi_status { #define SNDRV_RAWMIDI_IOCTL_STATUS _IOWR('W', 0x20, struct snd_rawmidi_status) #define SNDRV_RAWMIDI_IOCTL_DROP _IOW('W', 0x30, int) #define SNDRV_RAWMIDI_IOCTL_DRAIN _IOW('W', 0x31, int) +/* Additional ioctls for UMP rawmidi devices */ +#define SNDRV_UMP_IOCTL_ENDPOINT_INFO _IOR('W', 0x40, struct snd_ump_endpoint_info) +#define SNDRV_UMP_IOCTL_BLOCK_INFO _IOR('W', 0x41, struct snd_ump_block_info) /* * Timer section - /dev/snd/timer @@ -961,7 +1035,7 @@ struct snd_timer_tread { * * ****************************************************************************/ -#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 8) +#define SNDRV_CTL_VERSION SNDRV_PROTOCOL_VERSION(2, 0, 9) struct snd_ctl_card_info { int card; /* card number */ @@ -1122,6 +1196,9 @@ struct snd_ctl_tlv { #define SNDRV_CTL_IOCTL_RAWMIDI_NEXT_DEVICE _IOWR('U', 0x40, int) #define SNDRV_CTL_IOCTL_RAWMIDI_INFO _IOWR('U', 0x41, struct snd_rawmidi_info) #define SNDRV_CTL_IOCTL_RAWMIDI_PREFER_SUBDEVICE _IOW('U', 0x42, int) +#define SNDRV_CTL_IOCTL_UMP_NEXT_DEVICE _IOWR('U', 0x43, int) +#define SNDRV_CTL_IOCTL_UMP_ENDPOINT_INFO _IOWR('U', 0x44, struct snd_ump_endpoint_info) +#define SNDRV_CTL_IOCTL_UMP_BLOCK_INFO _IOWR('U', 0x45, struct snd_ump_block_info) #define SNDRV_CTL_IOCTL_POWER _IOWR('U', 0xd0, int) #define SNDRV_CTL_IOCTL_POWER_STATE _IOR('U', 0xd1, int) -- cgit v1.2.3 From 963293ff058cef54718fd225a542c73778257b3c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 10:36:16 -0300 Subject: tools headers arm64: Sync arm64's cputype.h with the kernel sources To get the changes in: e910baa9c1efdf76 ("KVM: arm64: vgic: Add Apple M2 PRO/MAX cpus to the list of broken SEIS implementations") That makes this perf source code to be rebuilt: CC /tmp/build/perf-tools/util/arm-spe.o The changes in the above patch don't affect things that are used in arm-spe.c (things like MIDR_NEOVERSE_N1, etc). Unsure if Apple M2 has SPE (Statistical Profiling Extension) :-) That addresses this perf build warning: Warning: Kernel ABI header differences: diff -u tools/arch/arm64/include/asm/cputype.h arch/arm64/include/asm/cputype.h Cc: Adrian Hunter Cc: Ali Saidi Cc: Ian Rogers Cc: Jiri Olsa Cc: Leo Yan Cc: Marc Zyngier Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ Signed-off-by: Arnaldo Carvalho de Melo --- tools/arch/arm64/include/asm/cputype.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tools/arch/arm64/include/asm/cputype.h b/tools/arch/arm64/include/asm/cputype.h index 683ca3af4084..5f6f84837a49 100644 --- a/tools/arch/arm64/include/asm/cputype.h +++ b/tools/arch/arm64/include/asm/cputype.h @@ -126,6 +126,10 @@ #define APPLE_CPU_PART_M1_FIRESTORM_MAX 0x029 #define APPLE_CPU_PART_M2_BLIZZARD 0x032 #define APPLE_CPU_PART_M2_AVALANCHE 0x033 +#define APPLE_CPU_PART_M2_BLIZZARD_PRO 0x034 +#define APPLE_CPU_PART_M2_AVALANCHE_PRO 0x035 +#define APPLE_CPU_PART_M2_BLIZZARD_MAX 0x038 +#define APPLE_CPU_PART_M2_AVALANCHE_MAX 0x039 #define AMPERE_CPU_PART_AMPERE1 0xAC3 @@ -181,6 +185,10 @@ #define MIDR_APPLE_M1_FIRESTORM_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM_MAX) #define MIDR_APPLE_M2_BLIZZARD MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD) #define MIDR_APPLE_M2_AVALANCHE MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE) +#define MIDR_APPLE_M2_BLIZZARD_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_PRO) +#define MIDR_APPLE_M2_AVALANCHE_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_PRO) +#define MIDR_APPLE_M2_BLIZZARD_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_BLIZZARD_MAX) +#define MIDR_APPLE_M2_AVALANCHE_MAX MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M2_AVALANCHE_MAX) #define MIDR_AMPERE1 MIDR_CPU_MODEL(ARM_CPU_IMP_AMPERE, AMPERE_CPU_PART_AMPERE1) /* Fujitsu Erratum 010001 affects A64FX 1.0 and 1.1, (v0r0 and v1r0) */ -- cgit v1.2.3 From 9f87fc4d72f52b26ac3e19df5e4584227fe6740c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jul 2023 16:30:14 +0200 Subject: block: queue data commands from the flush state machine at the head We used to insert the data commands following a pre-flush to the head of the queue until commit 1e82fadfc6b ("blk-mq: do not do head insertions post-pre-flush commands"). Not doing this seems to cause hangs of such commands on NFS workloads when exported from file systems with SATA SSDs. I have no idea why this would starve these workloads, but doing a semantic revert of this patch (which looks quite different due to various other changes) fixes the hangs. Fixes: 1e82fadfc6b ("blk-mq: do not do head insertions post-pre-flush commands") Reported-by: Chuck Lever Signed-off-by: Christoph Hellwig Tested-by: Chuck Lever Link: https://lore.kernel.org/r/20230714143014.11879-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-flush.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-flush.c b/block/blk-flush.c index dba392cf22be..8220517c2d67 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -189,7 +189,7 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); spin_lock(&q->requeue_lock); - list_add_tail(&rq->queuelist, &q->flush_list); + list_add(&rq->queuelist, &q->requeue_list); spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; -- cgit v1.2.3 From c071b34f62ddbf8435491ebb0e21eba9dc29f901 Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 14 Jul 2023 08:56:34 +0000 Subject: cifs: is_network_name_deleted should return a bool Currently, is_network_name_deleted and it's implementations do not return anything if the network name did get deleted. So the function doesn't fully achieve what it advertizes. Changed the function to return a bool instead. It will now return true if the error returned is STATUS_NETWORK_NAME_DELETED and the share (tree id) was found to be connected. It returns false otherwise. Signed-off-by: Shyam Prasad N Acked-by: Paulo Alcantara (SUSE) Signed-off-by: Steve French --- fs/smb/client/cifsglob.h | 2 +- fs/smb/client/connect.c | 11 ++++++++--- fs/smb/client/smb2ops.c | 8 +++++--- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index b5808fe3469a..e5eec6d38d02 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -532,7 +532,7 @@ struct smb_version_operations { /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ - void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); + bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 85dd1b373974..aad0edc5d5d1 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -1226,9 +1226,14 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; - if (bufs[i] && server->ops->is_network_name_deleted) - server->ops->is_network_name_deleted(bufs[i], - server); + if (bufs[i] != NULL) { + if (server->ops->is_network_name_deleted && + server->ops->is_network_name_deleted(bufs[i], + server)) { + cifs_server_dbg(FYI, + "Share deleted. Reconnect needed"); + } + } if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 87abce010974..0f62bc373ad0 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -2395,7 +2395,7 @@ smb2_is_status_io_timeout(char *buf) return false; } -static void +static bool smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) { struct smb2_hdr *shdr = (struct smb2_hdr *)buf; @@ -2404,7 +2404,7 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) struct cifs_tcon *tcon; if (shdr->Status != STATUS_NETWORK_NAME_DELETED) - return; + return false; /* If server is a channel, select the primary channel */ pserver = CIFS_SERVER_IS_CHAN(server) ? server->primary_server : server; @@ -2419,11 +2419,13 @@ smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) spin_unlock(&cifs_tcp_ses_lock); pr_warn_once("Server share %s deleted.\n", tcon->tree_name); - return; + return true; } } } spin_unlock(&cifs_tcp_ses_lock); + + return false; } static int -- cgit v1.2.3 From f13be0ad5344b46bb89323ea5576a2417c6d1e55 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Thu, 13 Jul 2023 14:54:33 +0100 Subject: selftests/mm: give scripts execute permission MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When run under run_vmtests.sh, test scripts were failing to run with "permission denied" due to the scripts not being executable. It is also annoying not to be able to directly invoke run_vmtests.sh, which is solved by giving also it the execute permission. Link: https://lkml.kernel.org/r/20230713135440.3651409-3-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Cc: SeongJae Park Cc: David Hildenbrand Cc: Florent Revest Cc: Jérôme Glisse Cc: John Hubbard Cc: Liam R. Howlett Cc: Mark Brown Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/charge_reserved_hugetlb.sh | 0 tools/testing/selftests/mm/check_config.sh | 0 tools/testing/selftests/mm/hugetlb_reparenting_test.sh | 0 tools/testing/selftests/mm/run_vmtests.sh | 0 tools/testing/selftests/mm/test_hmm.sh | 0 tools/testing/selftests/mm/test_vmalloc.sh | 0 tools/testing/selftests/mm/va_high_addr_switch.sh | 0 tools/testing/selftests/mm/write_hugetlb_memory.sh | 0 8 files changed, 0 insertions(+), 0 deletions(-) mode change 100644 => 100755 tools/testing/selftests/mm/charge_reserved_hugetlb.sh mode change 100644 => 100755 tools/testing/selftests/mm/check_config.sh mode change 100644 => 100755 tools/testing/selftests/mm/hugetlb_reparenting_test.sh mode change 100644 => 100755 tools/testing/selftests/mm/run_vmtests.sh mode change 100644 => 100755 tools/testing/selftests/mm/test_hmm.sh mode change 100644 => 100755 tools/testing/selftests/mm/test_vmalloc.sh mode change 100644 => 100755 tools/testing/selftests/mm/va_high_addr_switch.sh mode change 100644 => 100755 tools/testing/selftests/mm/write_hugetlb_memory.sh diff --git a/tools/testing/selftests/mm/charge_reserved_hugetlb.sh b/tools/testing/selftests/mm/charge_reserved_hugetlb.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/check_config.sh b/tools/testing/selftests/mm/check_config.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/hugetlb_reparenting_test.sh b/tools/testing/selftests/mm/hugetlb_reparenting_test.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/test_hmm.sh b/tools/testing/selftests/mm/test_hmm.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/test_vmalloc.sh b/tools/testing/selftests/mm/test_vmalloc.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/va_high_addr_switch.sh b/tools/testing/selftests/mm/va_high_addr_switch.sh old mode 100644 new mode 100755 diff --git a/tools/testing/selftests/mm/write_hugetlb_memory.sh b/tools/testing/selftests/mm/write_hugetlb_memory.sh old mode 100644 new mode 100755 -- cgit v1.2.3 From 69cba9d3c1284e0838ae408830a02c4a063104bc Mon Sep 17 00:00:00 2001 From: Shyam Prasad N Date: Fri, 14 Jul 2023 08:56:33 +0000 Subject: cifs: fix mid leak during reconnection after timeout threshold When the number of responses with status of STATUS_IO_TIMEOUT exceeds a specified threshold (NUM_STATUS_IO_TIMEOUT), we reconnect the connection. But we do not return the mid, or the credits returned for the mid, or reduce the number of in-flight requests. This bug could result in the server->in_flight count to go bad, and also cause a leak in the mids. This change moves the check to a few lines below where the response is decrypted, even of the response is read from the transform header. This way, the code for returning the mids can be reused. Also, the cifs_reconnect was reconnecting just the transport connection before. In case of multi-channel, this may not be what we want to do after several timeouts. Changed that to reconnect the session and the tree too. Also renamed NUM_STATUS_IO_TIMEOUT to a more appropriate name MAX_STATUS_IO_TIMEOUT. Fixes: 8e670f77c4a5 ("Handle STATUS_IO_TIMEOUT gracefully") Signed-off-by: Shyam Prasad N Signed-off-by: Steve French --- fs/smb/client/connect.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index aad0edc5d5d1..9280e253bf09 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -60,7 +60,7 @@ extern bool disable_legacy_dialects; #define TLINK_IDLE_EXPIRE (600 * HZ) /* Drop the connection to not overload the server */ -#define NUM_STATUS_IO_TIMEOUT 5 +#define MAX_STATUS_IO_TIMEOUT 5 static int ip_connect(struct TCP_Server_Info *server); static int generic_ip_connect(struct TCP_Server_Info *server); @@ -1117,6 +1117,7 @@ cifs_demultiplex_thread(void *p) struct mid_q_entry *mids[MAX_COMPOUND]; char *bufs[MAX_COMPOUND]; unsigned int noreclaim_flag, num_io_timeout = 0; + bool pending_reconnect = false; noreclaim_flag = memalloc_noreclaim_save(); cifs_dbg(FYI, "Demultiplex PID: %d\n", task_pid_nr(current)); @@ -1156,6 +1157,8 @@ cifs_demultiplex_thread(void *p) cifs_dbg(FYI, "RFC1002 header 0x%x\n", pdu_length); if (!is_smb_response(server, buf[0])) continue; + + pending_reconnect = false; next_pdu: server->pdu_size = pdu_length; @@ -1213,10 +1216,13 @@ next_pdu: if (server->ops->is_status_io_timeout && server->ops->is_status_io_timeout(buf)) { num_io_timeout++; - if (num_io_timeout > NUM_STATUS_IO_TIMEOUT) { - cifs_reconnect(server, false); + if (num_io_timeout > MAX_STATUS_IO_TIMEOUT) { + cifs_server_dbg(VFS, + "Number of request timeouts exceeded %d. Reconnecting", + MAX_STATUS_IO_TIMEOUT); + + pending_reconnect = true; num_io_timeout = 0; - continue; } } @@ -1268,6 +1274,11 @@ next_pdu: buf = server->smallbuf; goto next_pdu; } + + /* do this reconnect at the very end after processing all MIDs */ + if (pending_reconnect) + cifs_reconnect(server, true); + } /* end while !EXITING */ /* buffer usually freed in free_mid - need to free it here on exit */ -- cgit v1.2.3 From 24a3298ac9e6bd8de838ab79f7868207170d556d Mon Sep 17 00:00:00 2001 From: Petr Oros Date: Mon, 19 Jun 2023 12:58:13 +0200 Subject: ice: Unregister netdev and devlink_port only once Since commit 6624e780a577fc ("ice: split ice_vsi_setup into smaller functions") ice_vsi_release does things twice. There is unregister netdev which is unregistered in ice_deinit_eth also. It also unregisters the devlink_port twice which is also unregistered in ice_deinit_eth(). This double deregistration is hidden because devl_port_unregister ignores the return value of xa_erase. [ 68.642167] Call Trace: [ 68.650385] ice_devlink_destroy_pf_port+0xe/0x20 [ice] [ 68.655656] ice_vsi_release+0x445/0x690 [ice] [ 68.660147] ice_deinit+0x99/0x280 [ice] [ 68.664117] ice_remove+0x1b6/0x5c0 [ice] [ 171.103841] Call Trace: [ 171.109607] ice_devlink_destroy_pf_port+0xf/0x20 [ice] [ 171.114841] ice_remove+0x158/0x270 [ice] [ 171.118854] pci_device_remove+0x3b/0xc0 [ 171.122779] device_release_driver_internal+0xc7/0x170 [ 171.127912] driver_detach+0x54/0x8c [ 171.131491] bus_remove_driver+0x77/0xd1 [ 171.135406] pci_unregister_driver+0x2d/0xb0 [ 171.139670] ice_module_exit+0xc/0x55f [ice] Fixes: 6624e780a577 ("ice: split ice_vsi_setup into smaller functions") Signed-off-by: Petr Oros Reviewed-by: Maciej Fijalkowski Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_lib.c | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 00e3afd507a4..0054d7e64ec3 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -2972,39 +2972,12 @@ int ice_vsi_release(struct ice_vsi *vsi) return -ENODEV; pf = vsi->back; - /* do not unregister while driver is in the reset recovery pending - * state. Since reset/rebuild happens through PF service task workqueue, - * it's not a good idea to unregister netdev that is associated to the - * PF that is running the work queue items currently. This is done to - * avoid check_flush_dependency() warning on this wq - */ - if (vsi->netdev && !ice_is_reset_in_progress(pf->state) && - (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state))) { - unregister_netdev(vsi->netdev); - clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); - } - - if (vsi->type == ICE_VSI_PF) - ice_devlink_destroy_pf_port(pf); - if (test_bit(ICE_FLAG_RSS_ENA, pf->flags)) ice_rss_clean(vsi); ice_vsi_close(vsi); ice_vsi_decfg(vsi); - if (vsi->netdev) { - if (test_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state)) { - unregister_netdev(vsi->netdev); - clear_bit(ICE_VSI_NETDEV_REGISTERED, vsi->state); - } - if (test_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state)) { - free_netdev(vsi->netdev); - vsi->netdev = NULL; - clear_bit(ICE_VSI_NETDEV_ALLOCD, vsi->state); - } - } - /* retain SW VSI data structure since it is needed to unregister and * free VSI netdev when PF is not in reset recovery pending state,\ * for ex: during rmmod. -- cgit v1.2.3 From b3e7b3a6ee92ab927f750a6b19615ce88ece808f Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Thu, 6 Jul 2023 08:25:51 +0200 Subject: ice: prevent NULL pointer deref during reload Calling ethtool during reload can lead to call trace, because VSI isn't configured for some time, but netdev is alive. To fix it add rtnl lock for VSI deconfig and config. Set ::num_q_vectors to 0 after freeing and add a check for ::tx/rx_rings in ring related ethtool ops. Add proper unroll of filters in ice_start_eth(). Reproduction: $watch -n 0.1 -d 'ethtool -g enp24s0f0np0' $devlink dev reload pci/0000:18:00.0 action driver_reinit Call trace before fix: [66303.926205] BUG: kernel NULL pointer dereference, address: 0000000000000000 [66303.926259] #PF: supervisor read access in kernel mode [66303.926286] #PF: error_code(0x0000) - not-present page [66303.926311] PGD 0 P4D 0 [66303.926332] Oops: 0000 [#1] PREEMPT SMP PTI [66303.926358] CPU: 4 PID: 933821 Comm: ethtool Kdump: loaded Tainted: G OE 6.4.0-rc5+ #1 [66303.926400] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.00.01.0014.070920180847 07/09/2018 [66303.926446] RIP: 0010:ice_get_ringparam+0x22/0x50 [ice] [66303.926649] Code: 90 90 90 90 90 90 90 90 f3 0f 1e fa 0f 1f 44 00 00 48 8b 87 c0 09 00 00 c7 46 04 e0 1f 00 00 c7 46 10 e0 1f 00 00 48 8b 50 20 <48> 8b 12 0f b7 52 3a 89 56 14 48 8b 40 28 48 8b 00 0f b7 40 58 48 [66303.926722] RSP: 0018:ffffad40472f39c8 EFLAGS: 00010246 [66303.926749] RAX: ffff98a8ada05828 RBX: ffff98a8c46dd060 RCX: ffffad40472f3b48 [66303.926781] RDX: 0000000000000000 RSI: ffff98a8c46dd068 RDI: ffff98a8b23c4000 [66303.926811] RBP: ffffad40472f3b48 R08: 00000000000337b0 R09: 0000000000000000 [66303.926843] R10: 0000000000000001 R11: 0000000000000100 R12: ffff98a8b23c4000 [66303.926874] R13: ffff98a8c46dd060 R14: 000000000000000f R15: ffffad40472f3a50 [66303.926906] FS: 00007f6397966740(0000) GS:ffff98b390900000(0000) knlGS:0000000000000000 [66303.926941] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [66303.926967] CR2: 0000000000000000 CR3: 000000011ac20002 CR4: 00000000007706e0 [66303.926999] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [66303.927029] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [66303.927060] PKRU: 55555554 [66303.927075] Call Trace: [66303.927094] [66303.927111] ? __die+0x23/0x70 [66303.927140] ? page_fault_oops+0x171/0x4e0 [66303.927176] ? exc_page_fault+0x7f/0x180 [66303.927209] ? asm_exc_page_fault+0x26/0x30 [66303.927244] ? ice_get_ringparam+0x22/0x50 [ice] [66303.927433] rings_prepare_data+0x62/0x80 [66303.927469] ethnl_default_doit+0xe2/0x350 [66303.927501] genl_family_rcv_msg_doit.isra.0+0xe3/0x140 [66303.927538] genl_rcv_msg+0x1b1/0x2c0 [66303.927561] ? __pfx_ethnl_default_doit+0x10/0x10 [66303.927590] ? __pfx_genl_rcv_msg+0x10/0x10 [66303.927615] netlink_rcv_skb+0x58/0x110 [66303.927644] genl_rcv+0x28/0x40 [66303.927665] netlink_unicast+0x19e/0x290 [66303.927691] netlink_sendmsg+0x254/0x4d0 [66303.927717] sock_sendmsg+0x93/0xa0 [66303.927743] __sys_sendto+0x126/0x170 [66303.927780] __x64_sys_sendto+0x24/0x30 [66303.928593] do_syscall_64+0x5d/0x90 [66303.929370] ? __count_memcg_events+0x60/0xa0 [66303.930146] ? count_memcg_events.constprop.0+0x1a/0x30 [66303.930920] ? handle_mm_fault+0x9e/0x350 [66303.931688] ? do_user_addr_fault+0x258/0x740 [66303.932452] ? exc_page_fault+0x7f/0x180 [66303.933193] entry_SYSCALL_64_after_hwframe+0x72/0xdc Fixes: 5b246e533d01 ("ice: split probe into smaller functions") Reviewed-by: Przemek Kitszel Signed-off-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/ice/ice_base.c | 2 ++ drivers/net/ethernet/intel/ice/ice_ethtool.c | 13 +++++++++++-- drivers/net/ethernet/intel/ice/ice_main.c | 10 ++++++++-- 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c index 4a12316f7b46..b678bdf96f3a 100644 --- a/drivers/net/ethernet/intel/ice/ice_base.c +++ b/drivers/net/ethernet/intel/ice/ice_base.c @@ -800,6 +800,8 @@ void ice_vsi_free_q_vectors(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, v_idx) ice_free_q_vector(vsi, v_idx); + + vsi->num_q_vectors = 0; } /** diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index 8d5cbbd0b3d5..ad4d4702129f 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -2681,8 +2681,13 @@ ice_get_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, ring->rx_max_pending = ICE_MAX_NUM_DESC; ring->tx_max_pending = ICE_MAX_NUM_DESC; - ring->rx_pending = vsi->rx_rings[0]->count; - ring->tx_pending = vsi->tx_rings[0]->count; + if (vsi->tx_rings && vsi->rx_rings) { + ring->rx_pending = vsi->rx_rings[0]->count; + ring->tx_pending = vsi->tx_rings[0]->count; + } else { + ring->rx_pending = 0; + ring->tx_pending = 0; + } /* Rx mini and jumbo rings are not supported */ ring->rx_mini_max_pending = 0; @@ -2716,6 +2721,10 @@ ice_set_ringparam(struct net_device *netdev, struct ethtool_ringparam *ring, return -EINVAL; } + /* Return if there is no rings (device is reloading) */ + if (!vsi->tx_rings || !vsi->rx_rings) + return -EBUSY; + new_tx_cnt = ALIGN(ring->tx_pending, ICE_REQ_DESC_MULTIPLE); if (new_tx_cnt != ring->tx_pending) netdev_info(netdev, "Requested Tx descriptor count rounded up to %d\n", diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 19a5e7f3a075..f02d44455772 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -4430,9 +4430,9 @@ static int ice_start_eth(struct ice_vsi *vsi) if (err) return err; - rtnl_lock(); err = ice_vsi_open(vsi); - rtnl_unlock(); + if (err) + ice_fltr_remove_all(vsi); return err; } @@ -4895,6 +4895,7 @@ int ice_load(struct ice_pf *pf) params = ice_vsi_to_params(vsi); params.flags = ICE_VSI_FLAG_INIT; + rtnl_lock(); err = ice_vsi_cfg(vsi, ¶ms); if (err) goto err_vsi_cfg; @@ -4902,6 +4903,7 @@ int ice_load(struct ice_pf *pf) err = ice_start_eth(ice_get_main_vsi(pf)); if (err) goto err_start_eth; + rtnl_unlock(); err = ice_init_rdma(pf); if (err) @@ -4916,9 +4918,11 @@ int ice_load(struct ice_pf *pf) err_init_rdma: ice_vsi_close(ice_get_main_vsi(pf)); + rtnl_lock(); err_start_eth: ice_vsi_decfg(ice_get_main_vsi(pf)); err_vsi_cfg: + rtnl_unlock(); ice_deinit_dev(pf); return err; } @@ -4931,8 +4935,10 @@ void ice_unload(struct ice_pf *pf) { ice_deinit_features(pf); ice_deinit_rdma(pf); + rtnl_lock(); ice_stop_eth(ice_get_main_vsi(pf)); ice_vsi_decfg(ice_get_main_vsi(pf)); + rtnl_unlock(); ice_deinit_dev(pf); } -- cgit v1.2.3 From c77896b143d3c9c3e84c4ed0662b807ccbd8730b Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Wed, 12 Jul 2023 12:35:14 -0700 Subject: selftests/riscv: fix potential build failure during the "emit_tests" step The riscv selftests (which were modeled after the arm64 selftests) are improperly declaring the "emit_tests" target to depend upon the "all" target. This approach, when combined with commit 9fc96c7c19df ("selftests: error out if kernel header files are not yet built"), has caused build failures [1] on arm64, and is likely to cause similar failures for riscv. To fix this, simply remove the unnecessary "all" dependency from the emit_tests target. The dependency is still effectively honored, because again, invocation is via "install", which also depends upon "all". An alternative approach would be to harden the emit_tests target so that it can depend upon "all", but that's a lot more complicated and hard to get right, and doesn't seem worth it, especially given that emit_tests should probably not be overridden at all. [1] https://lore.kernel.org/20230710-kselftest-fix-arm64-v1-1-48e872844f25@kernel.org Fixes: 9fc96c7c19df ("selftests: error out if kernel header files are not yet built") Signed-off-by: John Hubbard Tested-by: Alexandre Ghiti Signed-off-by: Shuah Khan --- tools/testing/selftests/riscv/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/riscv/Makefile b/tools/testing/selftests/riscv/Makefile index 9dd629cc86aa..f4b3d5c9af5b 100644 --- a/tools/testing/selftests/riscv/Makefile +++ b/tools/testing/selftests/riscv/Makefile @@ -43,7 +43,7 @@ run_tests: all done # Avoid any output on non riscv on emit_tests -emit_tests: all +emit_tests: @for DIR in $(RISCV_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ $(MAKE) OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ -- cgit v1.2.3 From 569f8b501b177f21121d483a96491716ab8905f4 Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Mon, 10 Jul 2023 17:56:29 -0700 Subject: selftests/arm64: fix build failure during the "emit_tests" step The build failure reported in [1] occurred because commit <9fc96c7c19df> ("selftests: error out if kernel header files are not yet built") added a new "kernel_header_files" dependency to "all", and that triggered another, pre-existing problem. Specifically, the arm64 selftests override the emit_tests target, and that override improperly declares itself to depend upon the "all" target. This is a problem because the "emit_tests" target in lib.mk was not intended to be overridden. emit_tests is a very simple, sequential build target that was originally invoked from the "install" target, which in turn, depends upon "all". That approach worked for years. But with 9fc96c7c19df in place, emit_tests failed, because it does not set up all of the elaborate things that "install" does. And that caused the new "kernel_header_files" target (which depends upon $(KBUILD_OUTPUT) being correct) to fail. Some detail: The "all" target is .PHONY. Therefore, each target that depends on "all" will cause it to be invoked again, and because dependencies are managed quite loosely in the selftests Makefiles, many things will run, even "all" is invoked several times in immediate succession. So this is not a "real" failure, as far as build steps go: everything gets built, but "all" reports a problem when invoked a second time from a bad environment. To fix this, simply remove the unnecessary "all" dependency from the overridden emit_tests target. The dependency is still effectively honored, because again, invocation is via "install", which also depends upon "all". An alternative approach would be to harden the emit_tests target so that it can depend upon "all", but that's a lot more complicated and hard to get right, and doesn't seem worth it, especially given that emit_tests should probably not be overridden at all. [1] https://lore.kernel.org/20230710-kselftest-fix-arm64-v1-1-48e872844f25@kernel.org Fixes: 9fc96c7c19df ("selftests: error out if kernel header files are not yet built") Reported-by: Mark Brown Signed-off-by: John Hubbard Tested-by: Mark Brown Signed-off-by: Shuah Khan --- tools/testing/selftests/arm64/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile index 9460cbe81bcc..ace8b67fb22d 100644 --- a/tools/testing/selftests/arm64/Makefile +++ b/tools/testing/selftests/arm64/Makefile @@ -42,7 +42,7 @@ run_tests: all done # Avoid any output on non arm64 on emit_tests -emit_tests: all +emit_tests: @for DIR in $(ARM64_SUBTARGETS); do \ BUILD_TARGET=$(OUTPUT)/$$DIR; \ make OUTPUT=$$BUILD_TARGET -C $$DIR $@; \ -- cgit v1.2.3 From a66557c790208610bdc21c0c788e7b9524b1a356 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:19 -0700 Subject: net: bonding: remove kernel-doc comment marker Change an errant kernel-doc comment marker (/**) to a regular comment to prevent a kernel-doc warning. bonding.h:282: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Returns NULL if the net_device does not belong to any of the bond's slaves Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Cc: Jay Vosburgh Cc: Andy Gospodarek Link: https://lore.kernel.org/r/20230714045127.18752-2-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/bonding.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/bonding.h b/include/net/bonding.h index b57bec6e737e..30ac427cf0c6 100644 --- a/include/net/bonding.h +++ b/include/net/bonding.h @@ -277,7 +277,7 @@ struct bond_vlan_tag { unsigned short vlan_id; }; -/** +/* * Returns NULL if the net_device does not belong to any of the bond's slaves * * Caller must hold bond lock for read -- cgit v1.2.3 From a63e40444e1bdb2ccdcd7c2e4afa51fdbc6c8589 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:20 -0700 Subject: net: cfg802154: fix kernel-doc notation warnings Add an enum heading to the kernel-doc comments to prevent kernel-doc warnings. cfg802154.h:174: warning: Cannot understand * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support on line 174 - I thought it was a doc line cfg802154.h:192: warning: Enum value 'WPAN_PHY_FLAG_TXPOWER' not described in enum 'wpan_phy_flags' cfg802154.h:192: warning: Excess enum value 'WPAN_PHY_FLAG_TRANSMIT_POWER' description in 'wpan_phy_flags' Fixes: edea8f7c75ec ("cfg802154: introduce wpan phy flags") Signed-off-by: Randy Dunlap Cc: Alexander Aring Cc: Stefan Schmidt Cc: Marcel Holtmann Acked-by: Miquel Raynal Link: https://lore.kernel.org/r/20230714045127.18752-3-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/cfg802154.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/net/cfg802154.h b/include/net/cfg802154.h index e00057984489..f79ce133e51a 100644 --- a/include/net/cfg802154.h +++ b/include/net/cfg802154.h @@ -170,7 +170,8 @@ wpan_phy_cca_cmp(const struct wpan_phy_cca *a, const struct wpan_phy_cca *b) } /** - * @WPAN_PHY_FLAG_TRANSMIT_POWER: Indicates that transceiver will support + * enum wpan_phy_flags - WPAN PHY state flags + * @WPAN_PHY_FLAG_TXPOWER: Indicates that transceiver will support * transmit power setting. * @WPAN_PHY_FLAG_CCA_ED_LEVEL: Indicates that transceiver will support cca ed * level setting. -- cgit v1.2.3 From cfe57122bba5dc16ad64b162d97e42923fc7587e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:21 -0700 Subject: codel: fix kernel-doc notation warnings Use '@' before the struct member names in kernel-doc notation to prevent kernel-doc warnings. codel.h:158: warning: Function parameter or member 'ecn_mark' not described in 'codel_stats' codel.h:158: warning: Function parameter or member 'ce_mark' not described in 'codel_stats' Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Signed-off-by: Randy Dunlap Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Cc: Dave Taht Link: https://lore.kernel.org/r/20230714045127.18752-4-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/codel.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/codel.h b/include/net/codel.h index 5fed2f16cb8d..aa80f744826c 100644 --- a/include/net/codel.h +++ b/include/net/codel.h @@ -145,8 +145,8 @@ struct codel_vars { * @maxpacket: largest packet we've seen so far * @drop_count: temp count of dropped packets in dequeue() * @drop_len: bytes of dropped packets in dequeue() - * ecn_mark: number of packets we ECN marked instead of dropping - * ce_mark: number of packets CE marked because sojourn time was above ce_threshold + * @ecn_mark: number of packets we ECN marked instead of dropping + * @ce_mark: number of packets CE marked because sojourn time was above ce_threshold */ struct codel_stats { u32 maxpacket; -- cgit v1.2.3 From 839f55c5ebdfc2c0825bc7e711a7dedaca11f84e Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:22 -0700 Subject: devlink: fix kernel-doc notation warnings Spell function or struct member names correctly. Use ':' instead of '-' for struct member entries. Mark one field as private in kernel-doc. Add a few entries that were missing. Fix a typo. These changes prevent kernel-doc warnings: devlink.h:252: warning: Function parameter or member 'field_id' not described in 'devlink_dpipe_match' devlink.h:267: warning: Function parameter or member 'field_id' not described in 'devlink_dpipe_action' devlink.h:310: warning: Function parameter or member 'match_values_count' not described in 'devlink_dpipe_entry' devlink.h:355: warning: Function parameter or member 'list' not described in 'devlink_dpipe_table' devlink.h:374: warning: Function parameter or member 'actions_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'matches_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'entries_dump' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'counters_set_update' not described in 'devlink_dpipe_table_ops' devlink.h:374: warning: Function parameter or member 'size_get' not described in 'devlink_dpipe_table_ops' devlink.h:384: warning: Function parameter or member 'headers' not described in 'devlink_dpipe_headers' devlink.h:384: warning: Function parameter or member 'headers_count' not described in 'devlink_dpipe_headers' devlink.h:398: warning: Function parameter or member 'unit' not described in 'devlink_resource_size_params' devlink.h:487: warning: Function parameter or member 'id' not described in 'devlink_param' devlink.h:645: warning: Function parameter or member 'overwrite_mask' not described in 'devlink_flash_update_params' Fixes: 1555d204e743 ("devlink: Support for pipeline debug (dpipe)") Fixes: d9f9b9a4d05f ("devlink: Add support for resource abstraction") Fixes: eabaef1896bc ("devlink: Add devlink_param register and unregister") Fixes: 5d5b4128c4ca ("devlink: introduce flash update overwrite mask") Signed-off-by: Randy Dunlap Cc: Jiri Pirko Cc: Moshe Shemesh Cc: Jacob Keller Link: https://lore.kernel.org/r/20230714045127.18752-5-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/devlink.h | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/include/net/devlink.h b/include/net/devlink.h index 9a3c51aa6e81..0cdb4b16e5b5 100644 --- a/include/net/devlink.h +++ b/include/net/devlink.h @@ -221,7 +221,7 @@ struct devlink_dpipe_field { /** * struct devlink_dpipe_header - dpipe header object * @name: header name - * @id: index, global/local detrmined by global bit + * @id: index, global/local determined by global bit * @fields: fields * @fields_count: number of fields * @global: indicates if header is shared like most protocol header @@ -241,7 +241,7 @@ struct devlink_dpipe_header { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_match { enum devlink_dpipe_match_type type; @@ -256,7 +256,7 @@ struct devlink_dpipe_match { * @header_index: header index (packets can have several headers of same * type like in case of tunnels) * @header: header - * @fieled_id: field index + * @field_id: field index */ struct devlink_dpipe_action { enum devlink_dpipe_action_type type; @@ -292,7 +292,7 @@ struct devlink_dpipe_value { * struct devlink_dpipe_entry - table entry object * @index: index of the entry in the table * @match_values: match values - * @matche_values_count: count of matches tuples + * @match_values_count: count of matches tuples * @action_values: actions values * @action_values_count: count of actions values * @counter: value of counter @@ -342,7 +342,9 @@ struct devlink_dpipe_table_ops; */ struct devlink_dpipe_table { void *priv; + /* private: */ struct list_head list; + /* public: */ const char *name; bool counters_enabled; bool counter_control_extern; @@ -355,13 +357,13 @@ struct devlink_dpipe_table { /** * struct devlink_dpipe_table_ops - dpipe_table ops - * @actions_dump - dumps all tables actions - * @matches_dump - dumps all tables matches - * @entries_dump - dumps all active entries in the table - * @counters_set_update - when changing the counter status hardware sync + * @actions_dump: dumps all tables actions + * @matches_dump: dumps all tables matches + * @entries_dump: dumps all active entries in the table + * @counters_set_update: when changing the counter status hardware sync * maybe needed to allocate/free counter related * resources - * @size_get - get size + * @size_get: get size */ struct devlink_dpipe_table_ops { int (*actions_dump)(void *priv, struct sk_buff *skb); @@ -374,8 +376,8 @@ struct devlink_dpipe_table_ops { /** * struct devlink_dpipe_headers - dpipe headers - * @headers - header array can be shared (global bit) or driver specific - * @headers_count - count of headers + * @headers: header array can be shared (global bit) or driver specific + * @headers_count: count of headers */ struct devlink_dpipe_headers { struct devlink_dpipe_header **headers; @@ -387,7 +389,7 @@ struct devlink_dpipe_headers { * @size_min: minimum size which can be set * @size_max: maximum size which can be set * @size_granularity: size granularity - * @size_unit: resource's basic unit + * @unit: resource's basic unit */ struct devlink_resource_size_params { u64 size_min; @@ -457,6 +459,7 @@ struct devlink_flash_notify { /** * struct devlink_param - devlink configuration parameter data + * @id: devlink parameter id number * @name: name of the parameter * @generic: indicates if the parameter is generic or driver specific * @type: parameter type @@ -632,6 +635,7 @@ enum devlink_param_generic_id { * struct devlink_flash_update_params - Flash Update parameters * @fw: pointer to the firmware data to update from * @component: the flash component to update + * @overwrite_mask: which types of flash update are supported (may be %0) * * With the exception of fw, drivers must opt-in to parameters by * setting the appropriate bit in the supported_flash_update_params field in -- cgit v1.2.3 From d20909a0689f585fb6443da2d8797f7ad549f931 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:23 -0700 Subject: inet: frags: eliminate kernel-doc warning Modify the anonymous enum kernel-doc content so that it doesn't cause a kernel-doc warning. inet_frag.h:33: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst Fixes: 1ab1934ed80a ("inet: frags: enum the flag definitions and add descriptions") Signed-off-by: Randy Dunlap Cc: Nikolay Aleksandrov Link: https://lore.kernel.org/r/20230714045127.18752-6-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/inet_frag.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 325ad893f624..153960663ce4 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -29,7 +29,7 @@ struct fqdir { }; /** - * fragment queue flags + * enum: fragment queue flags * * @INET_FRAG_FIRST_IN: first fragment has arrived * @INET_FRAG_LAST_IN: final fragment has arrived -- cgit v1.2.3 From 201a08830d8c4698f97ce65329ba92e36b7f402a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:24 -0700 Subject: net: llc: fix kernel-doc notation warnings Use the corrent function parameter name or format to prevent kernel-doc warnings. Add 2 function parameter descriptions to prevent kernel-doc warnings. llc_pdu.h:278: warning: Function parameter or member 'da' not described in 'llc_pdu_decode_da' llc_pdu.h:278: warning: Excess function parameter 'sa' description in 'llc_pdu_decode_da' llc_pdu.h:330: warning: Function parameter or member 'skb' not described in 'llc_pdu_init_as_test_cmd' llc_pdu.h:379: warning: Function parameter or member 'svcs_supported' not described in 'llc_pdu_init_as_xid_cmd' llc_pdu.h:379: warning: Function parameter or member 'rx_window' not described in 'llc_pdu_init_as_xid_cmd' Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Randy Dunlap Link: https://lore.kernel.org/r/20230714045127.18752-7-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/llc_pdu.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h index 49aa79c7b278..7e73f8e5e497 100644 --- a/include/net/llc_pdu.h +++ b/include/net/llc_pdu.h @@ -269,7 +269,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) /** * llc_pdu_decode_da - extracts dest address of input frame * @skb: input skb that destination address must be extracted from it - * @sa: pointer to destination address (6 byte array). + * @da: pointer to destination address (6 byte array). * * This function extracts destination address(MAC) of input frame. */ @@ -321,7 +321,7 @@ static inline void llc_pdu_init_as_ui_cmd(struct sk_buff *skb) /** * llc_pdu_init_as_test_cmd - sets PDU as TEST - * @skb - Address of the skb to build + * @skb: Address of the skb to build * * Sets a PDU as TEST */ @@ -369,6 +369,8 @@ struct llc_xid_info { /** * llc_pdu_init_as_xid_cmd - sets bytes 3, 4 & 5 of LLC header as XID * @skb: input skb that header must be set into it. + * @svcs_supported: The class of the LLC (I or II) + * @rx_window: The size of the receive window of the LLC * * This function sets third,fourth,fifth and sixth bytes of LLC header as * a XID PDU. -- cgit v1.2.3 From d1533d726aa1efca3a7ae8f40c94ccb149d22e6a Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:25 -0700 Subject: net: NSH: fix kernel-doc notation warning Use the struct member's name and the correct format to prevent a kernel-doc warning. nsh.h:200: warning: Function parameter or member 'context' not described in 'nsh_md1_ctx' Fixes: 1f0b7744c505 ("net: add NSH header structures and helpers") Signed-off-by: Randy Dunlap Cc: Jiri Benc Link: https://lore.kernel.org/r/20230714045127.18752-8-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/nsh.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/nsh.h b/include/net/nsh.h index 350b1ad11c7f..16a751093896 100644 --- a/include/net/nsh.h +++ b/include/net/nsh.h @@ -192,7 +192,7 @@ /** * struct nsh_md1_ctx - Keeps track of NSH context data - * @nshc<1-4>: NSH Contexts. + * @context: NSH Contexts. */ struct nsh_md1_ctx { __be32 context[4]; -- cgit v1.2.3 From d1cca974548d76e0fa8b6761d25f7b47376a3780 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:26 -0700 Subject: pie: fix kernel-doc notation warning Spell a struct member's name correctly to prevent a kernel-doc warning. pie.h:38: warning: Function parameter or member 'tupdate' not described in 'pie_params' Fixes: b42a3d7c7cff ("pie: improve comments and commenting style") Signed-off-by: Randy Dunlap Cc: Leslie Monis Cc: "Mohit P. Tahiliani" Cc: Gautam Ramakrishnan Cc: Jamal Hadi Salim Cc: Cong Wang Cc: Jiri Pirko Link: https://lore.kernel.org/r/20230714045127.18752-9-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/pie.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/pie.h b/include/net/pie.h index 3fe2361e03b4..01cbc66825a4 100644 --- a/include/net/pie.h +++ b/include/net/pie.h @@ -17,7 +17,7 @@ /** * struct pie_params - contains pie parameters * @target: target delay in pschedtime - * @tudpate: interval at which drop probability is calculated + * @tupdate: interval at which drop probability is calculated * @limit: total number of packets that can be in the queue * @alpha: parameter to control drop probability * @beta: parameter to control drop probability -- cgit v1.2.3 From 04be3c95da8266a099c8446b6d1205ccf8a62e66 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 13 Jul 2023 21:51:27 -0700 Subject: rsi: remove kernel-doc comment marker Change an errant kernel-doc comment marker (/**) to a regular comment to prevent a kernel-doc warning. rsi_91x.h:3: warning: This comment starts with '/**', but isn't a kernel-doc comment. Refer Documentation/doc-guide/kernel-doc.rst * Copyright (c) 2017 Redpine Signals Inc. Fixes: 4c10d56a76bb ("rsi: add header file rsi_91x") Signed-off-by: Randy Dunlap Cc: Prameela Rani Garnepudi Cc: Siva Rebbagondla Acked-by: Kalle Valo Link: https://lore.kernel.org/r/20230714045127.18752-10-rdunlap@infradead.org Signed-off-by: Jakub Kicinski --- include/net/rsi_91x.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/net/rsi_91x.h b/include/net/rsi_91x.h index 040f07b47f1f..b2283762e446 100644 --- a/include/net/rsi_91x.h +++ b/include/net/rsi_91x.h @@ -1,4 +1,4 @@ -/** +/* * Copyright (c) 2017 Redpine Signals Inc. * * Permission to use, copy, modify, and/or distribute this software for any -- cgit v1.2.3 From fdf0eaf11452d72945af31804e2a1048ee1b574c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 16 Jul 2023 15:10:37 -0700 Subject: Linux 6.5-rc2 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 47690c28456a..658ec2b8aa74 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 5 SUBLEVEL = 0 -EXTRAVERSION = -rc1 +EXTRAVERSION = -rc2 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* -- cgit v1.2.3 From 98e2dd5f7a8be5cb2501a897e96910393a49f0ff Mon Sep 17 00:00:00 2001 From: Martin Fuzzey Date: Fri, 16 Jun 2023 16:36:28 +0200 Subject: regulator: da9063: fix null pointer deref with partial DT config When some of the da9063 regulators do not have corresponding DT nodes a null pointer dereference occurs on boot because such regulators have no init_data causing the pointers calculated in da9063_check_xvp_constraints() to be invalid. Do not dereference them in this case. Fixes: b8717a80e6ee ("regulator: da9063: implement setter for voltage monitoring") Signed-off-by: Martin Fuzzey Link: https://lore.kernel.org/r/20230616143736.2946173-1-martin.fuzzey@flowbird.group Signed-off-by: Mark Brown --- drivers/regulator/da9063-regulator.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/regulator/da9063-regulator.c b/drivers/regulator/da9063-regulator.c index c5dd77be558b..dfd5ec9f75c9 100644 --- a/drivers/regulator/da9063-regulator.c +++ b/drivers/regulator/da9063-regulator.c @@ -778,6 +778,9 @@ static int da9063_check_xvp_constraints(struct regulator_config *config) const struct notification_limit *uv_l = &constr->under_voltage_limits; const struct notification_limit *ov_l = &constr->over_voltage_limits; + if (!config->init_data) /* No config in DT, pointers will be invalid */ + return 0; + /* make sure that only one severity is used to clarify if unchanged, enabled or disabled */ if ((!!uv_l->prot + !!uv_l->err + !!uv_l->warn) > 1) { dev_err(config->dev, "%s: at most one voltage monitoring severity allowed!\n", -- cgit v1.2.3 From b3d0e0489430735e2e7626aa37e6462cdd136e9d Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 13 Jul 2023 15:05:10 -0300 Subject: net: sched: cls_matchall: Undo tcf_bind_filter in case of failure after mall_set_parms In case an error occurred after mall_set_parms executed successfully, we must undo the tcf_bind_filter call it issues. Fix that by calling tcf_unbind_filter in err_replace_hw_filter label. Fixes: ec2507d2a306 ("net/sched: cls_matchall: Fix error path") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_matchall.c | 35 ++++++++++++----------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index fa3bbd187eb9..c4ed11df6254 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -159,26 +159,6 @@ static const struct nla_policy mall_policy[TCA_MATCHALL_MAX + 1] = { [TCA_MATCHALL_FLAGS] = { .type = NLA_U32 }, }; -static int mall_set_parms(struct net *net, struct tcf_proto *tp, - struct cls_mall_head *head, - unsigned long base, struct nlattr **tb, - struct nlattr *est, u32 flags, u32 fl_flags, - struct netlink_ext_ack *extack) -{ - int err; - - err = tcf_exts_validate_ex(net, tp, tb, est, &head->exts, flags, - fl_flags, extack); - if (err < 0) - return err; - - if (tb[TCA_MATCHALL_CLASSID]) { - head->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); - tcf_bind_filter(tp, &head->res, base); - } - return 0; -} - static int mall_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -187,6 +167,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, { struct cls_mall_head *head = rtnl_dereference(tp->root); struct nlattr *tb[TCA_MATCHALL_MAX + 1]; + bool bound_to_filter = false; struct cls_mall_head *new; u32 userflags = 0; int err; @@ -226,11 +207,17 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, goto err_alloc_percpu; } - err = mall_set_parms(net, tp, new, base, tb, tca[TCA_RATE], - flags, new->flags, extack); - if (err) + err = tcf_exts_validate_ex(net, tp, tb, tca[TCA_RATE], + &new->exts, flags, new->flags, extack); + if (err < 0) goto err_set_parms; + if (tb[TCA_MATCHALL_CLASSID]) { + new->res.classid = nla_get_u32(tb[TCA_MATCHALL_CLASSID]); + tcf_bind_filter(tp, &new->res, base); + bound_to_filter = true; + } + if (!tc_skip_hw(new->flags)) { err = mall_replace_hw_filter(tp, new, (unsigned long)new, extack); @@ -246,6 +233,8 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, return 0; err_replace_hw_filter: + if (bound_to_filter) + tcf_unbind_filter(tp, &new->res); err_set_parms: free_percpu(new->pf); err_alloc_percpu: -- cgit v1.2.3 From 9cb36faedeafb9720ac236aeae2ea57091d90a09 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 13 Jul 2023 15:05:11 -0300 Subject: net: sched: cls_u32: Undo tcf_bind_filter if u32_replace_hw_knode When u32_replace_hw_knode fails, we need to undo the tcf_bind_filter operation done at u32_set_parms. Fixes: d34e3e181395 ("net: cls_u32: Add support for skip-sw flag to tc u32 classifier.") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 41 ++++++++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 11 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index d15d50de7980..ed358466d042 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -712,8 +712,23 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = { [TCA_U32_FLAGS] = { .type = NLA_U32 }, }; +static void u32_unbind_filter(struct tcf_proto *tp, struct tc_u_knode *n, + struct nlattr **tb) +{ + if (tb[TCA_U32_CLASSID]) + tcf_unbind_filter(tp, &n->res); +} + +static void u32_bind_filter(struct tcf_proto *tp, struct tc_u_knode *n, + unsigned long base, struct nlattr **tb) +{ + if (tb[TCA_U32_CLASSID]) { + n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); + tcf_bind_filter(tp, &n->res, base); + } +} + static int u32_set_parms(struct net *net, struct tcf_proto *tp, - unsigned long base, struct tc_u_knode *n, struct nlattr **tb, struct nlattr *est, u32 flags, u32 fl_flags, struct netlink_ext_ack *extack) @@ -760,10 +775,6 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, if (ht_old) ht_old->refcnt--; } - if (tb[TCA_U32_CLASSID]) { - n->res.classid = nla_get_u32(tb[TCA_U32_CLASSID]); - tcf_bind_filter(tp, &n->res, base); - } if (ifindex >= 0) n->ifindex = ifindex; @@ -903,17 +914,20 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (!new) return -ENOMEM; - err = u32_set_parms(net, tp, base, new, tb, - tca[TCA_RATE], flags, new->flags, - extack); + err = u32_set_parms(net, tp, new, tb, tca[TCA_RATE], + flags, new->flags, extack); if (err) { __u32_destroy_key(new); return err; } + u32_bind_filter(tp, new, base, tb); + err = u32_replace_hw_knode(tp, new, flags, extack); if (err) { + u32_unbind_filter(tp, new, tb); + __u32_destroy_key(new); return err; } @@ -1074,15 +1088,18 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - err = u32_set_parms(net, tp, base, n, tb, tca[TCA_RATE], + err = u32_set_parms(net, tp, n, tb, tca[TCA_RATE], flags, n->flags, extack); + + u32_bind_filter(tp, n, base, tb); + if (err == 0) { struct tc_u_knode __rcu **ins; struct tc_u_knode *pins; err = u32_replace_hw_knode(tp, n, flags, extack); if (err) - goto errhw; + goto errunbind; if (!tc_in_hw(n->flags)) n->flags |= TCA_CLS_FLAGS_NOT_IN_HW; @@ -1100,7 +1117,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return 0; } -errhw: +errunbind: + u32_unbind_filter(tp, n, tb); + #ifdef CONFIG_CLS_U32_MARK free_percpu(n->pcpu_success); #endif -- cgit v1.2.3 From e8d3d78c19be0264a5692bed477c303523aead31 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 13 Jul 2023 15:05:12 -0300 Subject: net: sched: cls_u32: Undo refcount decrement in case update failed In the case of an update, when TCA_U32_LINK is set, u32_set_parms will decrement the refcount of the ht_down (struct tc_u_hnode) pointer present in the older u32 filter which we are replacing. However, if u32_replace_hw_knode errors out, the update command fails and that ht_down pointer continues decremented. To fix that, when u32_replace_hw_knode fails, check if ht_down's refcount was decremented and undo the decrement. Fixes: d34e3e181395 ("net: cls_u32: Add support for skip-sw flag to tc u32 classifier.") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_u32.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index ed358466d042..5abf31e432ca 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -928,6 +928,13 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, if (err) { u32_unbind_filter(tp, new, tb); + if (tb[TCA_U32_LINK]) { + struct tc_u_hnode *ht_old; + + ht_old = rtnl_dereference(n->ht_down); + if (ht_old) + ht_old->refcnt++; + } __u32_destroy_key(new); return err; } -- cgit v1.2.3 From 26a22194927e8521e304ed75c2f38d8068d55fc7 Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 13 Jul 2023 15:05:13 -0300 Subject: net: sched: cls_bpf: Undo tcf_bind_filter in case of an error If cls_bpf_offload errors out, we must also undo tcf_bind_filter that was done before the error. Fix that by calling tcf_unbind_filter in errout_parms. Fixes: eadb41489fd2 ("net: cls_bpf: add support for marking filters as hardware-only") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_bpf.c | 99 +++++++++++++++++++++++++---------------------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index 466c26df853a..382c7a71f81f 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -406,56 +406,6 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog, return 0; } -static int cls_bpf_set_parms(struct net *net, struct tcf_proto *tp, - struct cls_bpf_prog *prog, unsigned long base, - struct nlattr **tb, struct nlattr *est, u32 flags, - struct netlink_ext_ack *extack) -{ - bool is_bpf, is_ebpf, have_exts = false; - u32 gen_flags = 0; - int ret; - - is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; - is_ebpf = tb[TCA_BPF_FD]; - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) - return -EINVAL; - - ret = tcf_exts_validate(net, tp, tb, est, &prog->exts, flags, - extack); - if (ret < 0) - return ret; - - if (tb[TCA_BPF_FLAGS]) { - u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); - - if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) - return -EINVAL; - - have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; - } - if (tb[TCA_BPF_FLAGS_GEN]) { - gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); - if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || - !tc_flags_valid(gen_flags)) - return -EINVAL; - } - - prog->exts_integrated = have_exts; - prog->gen_flags = gen_flags; - - ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : - cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); - if (ret < 0) - return ret; - - if (tb[TCA_BPF_CLASSID]) { - prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); - tcf_bind_filter(tp, &prog->res, base); - } - - return 0; -} - static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct tcf_proto *tp, unsigned long base, u32 handle, struct nlattr **tca, @@ -463,9 +413,12 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, struct netlink_ext_ack *extack) { struct cls_bpf_head *head = rtnl_dereference(tp->root); + bool is_bpf, is_ebpf, have_exts = false; struct cls_bpf_prog *oldprog = *arg; struct nlattr *tb[TCA_BPF_MAX + 1]; + bool bound_to_filter = false; struct cls_bpf_prog *prog; + u32 gen_flags = 0; int ret; if (tca[TCA_OPTIONS] == NULL) @@ -504,11 +457,51 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, goto errout; prog->handle = handle; - ret = cls_bpf_set_parms(net, tp, prog, base, tb, tca[TCA_RATE], flags, - extack); + is_bpf = tb[TCA_BPF_OPS_LEN] && tb[TCA_BPF_OPS]; + is_ebpf = tb[TCA_BPF_FD]; + if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { + ret = -EINVAL; + goto errout_idr; + } + + ret = tcf_exts_validate(net, tp, tb, tca[TCA_RATE], &prog->exts, + flags, extack); + if (ret < 0) + goto errout_idr; + + if (tb[TCA_BPF_FLAGS]) { + u32 bpf_flags = nla_get_u32(tb[TCA_BPF_FLAGS]); + + if (bpf_flags & ~TCA_BPF_FLAG_ACT_DIRECT) { + ret = -EINVAL; + goto errout_idr; + } + + have_exts = bpf_flags & TCA_BPF_FLAG_ACT_DIRECT; + } + if (tb[TCA_BPF_FLAGS_GEN]) { + gen_flags = nla_get_u32(tb[TCA_BPF_FLAGS_GEN]); + if (gen_flags & ~CLS_BPF_SUPPORTED_GEN_FLAGS || + !tc_flags_valid(gen_flags)) { + ret = -EINVAL; + goto errout_idr; + } + } + + prog->exts_integrated = have_exts; + prog->gen_flags = gen_flags; + + ret = is_bpf ? cls_bpf_prog_from_ops(tb, prog) : + cls_bpf_prog_from_efd(tb, prog, gen_flags, tp); if (ret < 0) goto errout_idr; + if (tb[TCA_BPF_CLASSID]) { + prog->res.classid = nla_get_u32(tb[TCA_BPF_CLASSID]); + tcf_bind_filter(tp, &prog->res, base); + bound_to_filter = true; + } + ret = cls_bpf_offload(tp, prog, oldprog, extack); if (ret) goto errout_parms; @@ -530,6 +523,8 @@ static int cls_bpf_change(struct net *net, struct sk_buff *in_skb, return 0; errout_parms: + if (bound_to_filter) + tcf_unbind_filter(tp, &prog->res); cls_bpf_free_parms(prog); errout_idr: if (!oldprog) -- cgit v1.2.3 From ac177a330077f264664f56259038e121bb214bec Mon Sep 17 00:00:00 2001 From: Victor Nogueira Date: Thu, 13 Jul 2023 15:05:14 -0300 Subject: net: sched: cls_flower: Undo tcf_bind_filter in case of an error If TCA_FLOWER_CLASSID is specified in the netlink message, the code will call tcf_bind_filter. However, if any error occurs after that, the code should undo this by calling tcf_unbind_filter. Fixes: 77b9900ef53a ("tc: introduce Flower classifier") Signed-off-by: Victor Nogueira Acked-by: Jamal Hadi Salim Reviewed-by: Pedro Tammela Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- net/sched/cls_flower.c | 99 ++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 52 deletions(-) diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index f2b0bc4142fe..8da9d039d964 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -2173,53 +2173,6 @@ static bool fl_needs_tc_skb_ext(const struct fl_flow_key *mask) return mask->meta.l2_miss; } -static int fl_set_parms(struct net *net, struct tcf_proto *tp, - struct cls_fl_filter *f, struct fl_flow_mask *mask, - unsigned long base, struct nlattr **tb, - struct nlattr *est, - struct fl_flow_tmplt *tmplt, - u32 flags, u32 fl_flags, - struct netlink_ext_ack *extack) -{ - int err; - - err = tcf_exts_validate_ex(net, tp, tb, est, &f->exts, flags, - fl_flags, extack); - if (err < 0) - return err; - - if (tb[TCA_FLOWER_CLASSID]) { - f->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); - if (flags & TCA_ACT_FLAGS_NO_RTNL) - rtnl_lock(); - tcf_bind_filter(tp, &f->res, base); - if (flags & TCA_ACT_FLAGS_NO_RTNL) - rtnl_unlock(); - } - - err = fl_set_key(net, tb, &f->key, &mask->key, extack); - if (err) - return err; - - fl_mask_update_range(mask); - fl_set_masked_key(&f->mkey, &f->key, mask); - - if (!fl_mask_fits_tmplt(tmplt, mask)) { - NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template"); - return -EINVAL; - } - - /* Enable tc skb extension if filter matches on data extracted from - * this extension. - */ - if (fl_needs_tc_skb_ext(&mask->key)) { - f->needs_tc_skb_ext = 1; - tc_skb_ext_tc_enable(); - } - - return 0; -} - static int fl_ht_insert_unique(struct cls_fl_filter *fnew, struct cls_fl_filter *fold, bool *in_ht) @@ -2251,6 +2204,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, struct cls_fl_head *head = fl_head_dereference(tp); bool rtnl_held = !(flags & TCA_ACT_FLAGS_NO_RTNL); struct cls_fl_filter *fold = *arg; + bool bound_to_filter = false; struct cls_fl_filter *fnew; struct fl_flow_mask *mask; struct nlattr **tb; @@ -2335,15 +2289,46 @@ static int fl_change(struct net *net, struct sk_buff *in_skb, if (err < 0) goto errout_idr; - err = fl_set_parms(net, tp, fnew, mask, base, tb, tca[TCA_RATE], - tp->chain->tmplt_priv, flags, fnew->flags, - extack); - if (err) + err = tcf_exts_validate_ex(net, tp, tb, tca[TCA_RATE], + &fnew->exts, flags, fnew->flags, + extack); + if (err < 0) goto errout_idr; + if (tb[TCA_FLOWER_CLASSID]) { + fnew->res.classid = nla_get_u32(tb[TCA_FLOWER_CLASSID]); + if (flags & TCA_ACT_FLAGS_NO_RTNL) + rtnl_lock(); + tcf_bind_filter(tp, &fnew->res, base); + if (flags & TCA_ACT_FLAGS_NO_RTNL) + rtnl_unlock(); + bound_to_filter = true; + } + + err = fl_set_key(net, tb, &fnew->key, &mask->key, extack); + if (err) + goto unbind_filter; + + fl_mask_update_range(mask); + fl_set_masked_key(&fnew->mkey, &fnew->key, mask); + + if (!fl_mask_fits_tmplt(tp->chain->tmplt_priv, mask)) { + NL_SET_ERR_MSG_MOD(extack, "Mask does not fit the template"); + err = -EINVAL; + goto unbind_filter; + } + + /* Enable tc skb extension if filter matches on data extracted from + * this extension. + */ + if (fl_needs_tc_skb_ext(&mask->key)) { + fnew->needs_tc_skb_ext = 1; + tc_skb_ext_tc_enable(); + } + err = fl_check_assign_mask(head, fnew, fold, mask); if (err) - goto errout_idr; + goto unbind_filter; err = fl_ht_insert_unique(fnew, fold, &in_ht); if (err) @@ -2434,6 +2419,16 @@ errout_hw: fnew->mask->filter_ht_params); errout_mask: fl_mask_put(head, fnew->mask); + +unbind_filter: + if (bound_to_filter) { + if (flags & TCA_ACT_FLAGS_NO_RTNL) + rtnl_lock(); + tcf_unbind_filter(tp, &fnew->res); + if (flags & TCA_ACT_FLAGS_NO_RTNL) + rtnl_unlock(); + } + errout_idr: if (!fold) idr_remove(&head->handle_idr, fnew->handle); -- cgit v1.2.3 From 4bdf79d686b49ac49373b36466acfb93972c7d7c Mon Sep 17 00:00:00 2001 From: Tristram Ha Date: Thu, 13 Jul 2023 17:46:22 -0700 Subject: net: dsa: microchip: correct KSZ8795 static MAC table access The KSZ8795 driver code was modified to use on KSZ8863/73, which has different register definitions. Some of the new KSZ8795 register information are wrong compared to previous code. KSZ8795 also behaves differently in that the STATIC_MAC_TABLE_USE_FID and STATIC_MAC_TABLE_FID bits are off by 1 when doing MAC table reading than writing. To compensate that a special code was added to shift the register value by 1 before applying those bits. This is wrong when the code is running on KSZ8863, so this special code is only executed when KSZ8795 is detected. Fixes: 4b20a07e103f ("net: dsa: microchip: ksz8795: add support for ksz88xx chips") Signed-off-by: Tristram Ha Reviewed-by: Horatiu Vultur Reviewed-by: Simon Horman Signed-off-by: David S. Miller --- drivers/net/dsa/microchip/ksz8795.c | 8 +++++++- drivers/net/dsa/microchip/ksz_common.c | 8 ++++---- drivers/net/dsa/microchip/ksz_common.h | 7 +++++++ 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/microchip/ksz8795.c b/drivers/net/dsa/microchip/ksz8795.c index 84d502589f8e..91aba470fb2f 100644 --- a/drivers/net/dsa/microchip/ksz8795.c +++ b/drivers/net/dsa/microchip/ksz8795.c @@ -506,7 +506,13 @@ static int ksz8_r_sta_mac_table(struct ksz_device *dev, u16 addr, (data_hi & masks[STATIC_MAC_TABLE_FWD_PORTS]) >> shifts[STATIC_MAC_FWD_PORTS]; alu->is_override = (data_hi & masks[STATIC_MAC_TABLE_OVERRIDE]) ? 1 : 0; - data_hi >>= 1; + + /* KSZ8795 family switches have STATIC_MAC_TABLE_USE_FID and + * STATIC_MAC_TABLE_FID definitions off by 1 when doing read on the + * static MAC table compared to doing write. + */ + if (ksz_is_ksz87xx(dev)) + data_hi >>= 1; alu->is_static = true; alu->is_use_fid = (data_hi & masks[STATIC_MAC_TABLE_USE_FID]) ? 1 : 0; alu->fid = (data_hi & masks[STATIC_MAC_TABLE_FID]) >> diff --git a/drivers/net/dsa/microchip/ksz_common.c b/drivers/net/dsa/microchip/ksz_common.c index 813b91a816bb..b18cd170ec06 100644 --- a/drivers/net/dsa/microchip/ksz_common.c +++ b/drivers/net/dsa/microchip/ksz_common.c @@ -331,13 +331,13 @@ static const u32 ksz8795_masks[] = { [STATIC_MAC_TABLE_VALID] = BIT(21), [STATIC_MAC_TABLE_USE_FID] = BIT(23), [STATIC_MAC_TABLE_FID] = GENMASK(30, 24), - [STATIC_MAC_TABLE_OVERRIDE] = BIT(26), - [STATIC_MAC_TABLE_FWD_PORTS] = GENMASK(24, 20), + [STATIC_MAC_TABLE_OVERRIDE] = BIT(22), + [STATIC_MAC_TABLE_FWD_PORTS] = GENMASK(20, 16), [DYNAMIC_MAC_TABLE_ENTRIES_H] = GENMASK(6, 0), - [DYNAMIC_MAC_TABLE_MAC_EMPTY] = BIT(8), + [DYNAMIC_MAC_TABLE_MAC_EMPTY] = BIT(7), [DYNAMIC_MAC_TABLE_NOT_READY] = BIT(7), [DYNAMIC_MAC_TABLE_ENTRIES] = GENMASK(31, 29), - [DYNAMIC_MAC_TABLE_FID] = GENMASK(26, 20), + [DYNAMIC_MAC_TABLE_FID] = GENMASK(22, 16), [DYNAMIC_MAC_TABLE_SRC_PORT] = GENMASK(26, 24), [DYNAMIC_MAC_TABLE_TIMESTAMP] = GENMASK(28, 27), [P_MII_TX_FLOW_CTRL] = BIT(5), diff --git a/drivers/net/dsa/microchip/ksz_common.h b/drivers/net/dsa/microchip/ksz_common.h index 28444e5924f9..a4de58847dea 100644 --- a/drivers/net/dsa/microchip/ksz_common.h +++ b/drivers/net/dsa/microchip/ksz_common.h @@ -601,6 +601,13 @@ static inline void ksz_regmap_unlock(void *__mtx) mutex_unlock(mtx); } +static inline bool ksz_is_ksz87xx(struct ksz_device *dev) +{ + return dev->chip_id == KSZ8795_CHIP_ID || + dev->chip_id == KSZ8794_CHIP_ID || + dev->chip_id == KSZ8765_CHIP_ID; +} + static inline bool ksz_is_ksz88x3(struct ksz_device *dev) { return dev->chip_id == KSZ8830_CHIP_ID; -- cgit v1.2.3 From 162d626f3013215b82b6514ca14f20932c7ccce5 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Fri, 14 Jul 2023 07:39:36 +0200 Subject: r8169: fix ASPM-related problem for chip version 42 and 43 Referenced commit missed that for chip versions 42 and 43 ASPM remained disabled in the respective rtl_hw_start_...() routines. This resulted in problems as described in the referenced bug ticket. Therefore re-instantiate the previous logic. Fixes: 5fc3f6c90cca ("r8169: consolidate disabling ASPM before EPHY access") Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217635 Signed-off-by: Heiner Kallweit Signed-off-by: David S. Miller --- drivers/net/ethernet/realtek/r8169_main.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 9445f04f8d48..fce4a2b908c2 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -2747,6 +2747,13 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) return; if (enable) { + /* On these chip versions ASPM can even harm + * bus communication of other PCI devices. + */ + if (tp->mac_version == RTL_GIGA_MAC_VER_42 || + tp->mac_version == RTL_GIGA_MAC_VER_43) + return; + rtl_mod_config5(tp, 0, ASPM_en); rtl_mod_config2(tp, 0, ClkReqEn); -- cgit v1.2.3 From ee8b94c8510ce64afe0b87ef548d23e00915fb10 Mon Sep 17 00:00:00 2001 From: Ziyang Xuan Date: Tue, 11 Jul 2023 09:17:37 +0800 Subject: can: raw: fix receiver memory leak Got kmemleak errors with the following ltp can_filter testcase: for ((i=1; i<=100; i++)) do ./can_filter & sleep 0.1 done ============================================================== [<00000000db4a4943>] can_rx_register+0x147/0x360 [can] [<00000000a289549d>] raw_setsockopt+0x5ef/0x853 [can_raw] [<000000006d3d9ebd>] __sys_setsockopt+0x173/0x2c0 [<00000000407dbfec>] __x64_sys_setsockopt+0x61/0x70 [<00000000fd468496>] do_syscall_64+0x33/0x40 [<00000000b7e47d51>] entry_SYSCALL_64_after_hwframe+0x61/0xc6 It's a bug in the concurrent scenario of unregister_netdevice_many() and raw_release() as following: cpu0 cpu1 unregister_netdevice_many(can_dev) unlist_netdevice(can_dev) // dev_get_by_index() return NULL after this net_set_todo(can_dev) raw_release(can_socket) dev = dev_get_by_index(, ro->ifindex); // dev == NULL if (dev) { // receivers in dev_rcv_lists not free because dev is NULL raw_disable_allfilters(, dev, ); dev_put(dev); } ... ro->bound = 0; ... call_netdevice_notifiers(NETDEV_UNREGISTER, ) raw_notify(, NETDEV_UNREGISTER, ) if (ro->bound) // invalid because ro->bound has been set 0 raw_disable_allfilters(, dev, ); // receivers in dev_rcv_lists will never be freed Add a net_device pointer member in struct raw_sock to record bound can_dev, and use rtnl_lock to serialize raw_socket members between raw_bind(), raw_release(), raw_setsockopt() and raw_notify(). Use ro->dev to decide whether to free receivers in dev_rcv_lists. Fixes: 8d0caedb7596 ("can: bcm/raw/isotp: use per module netdevice notifier") Reviewed-by: Oliver Hartkopp Acked-by: Oliver Hartkopp Signed-off-by: Ziyang Xuan Link: https://lore.kernel.org/all/20230711011737.1969582-1-william.xuanziyang@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/raw.c | 57 ++++++++++++++++++++++++--------------------------------- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/net/can/raw.c b/net/can/raw.c index 15c79b079184..2302e4882967 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -84,6 +84,7 @@ struct raw_sock { struct sock sk; int bound; int ifindex; + struct net_device *dev; struct list_head notifier; int loopback; int recv_own_msgs; @@ -277,7 +278,7 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg, if (!net_eq(dev_net(dev), sock_net(sk))) return; - if (ro->ifindex != dev->ifindex) + if (ro->dev != dev) return; switch (msg) { @@ -292,6 +293,7 @@ static void raw_notify(struct raw_sock *ro, unsigned long msg, ro->ifindex = 0; ro->bound = 0; + ro->dev = NULL; ro->count = 0; release_sock(sk); @@ -337,6 +339,7 @@ static int raw_init(struct sock *sk) ro->bound = 0; ro->ifindex = 0; + ro->dev = NULL; /* set default filter to single entry dfilter */ ro->dfilter.can_id = 0; @@ -385,19 +388,13 @@ static int raw_release(struct socket *sock) lock_sock(sk); + rtnl_lock(); /* remove current filters & unregister */ if (ro->bound) { - if (ro->ifindex) { - struct net_device *dev; - - dev = dev_get_by_index(sock_net(sk), ro->ifindex); - if (dev) { - raw_disable_allfilters(dev_net(dev), dev, sk); - dev_put(dev); - } - } else { + if (ro->dev) + raw_disable_allfilters(dev_net(ro->dev), ro->dev, sk); + else raw_disable_allfilters(sock_net(sk), NULL, sk); - } } if (ro->count > 1) @@ -405,8 +402,10 @@ static int raw_release(struct socket *sock) ro->ifindex = 0; ro->bound = 0; + ro->dev = NULL; ro->count = 0; free_percpu(ro->uniq); + rtnl_unlock(); sock_orphan(sk); sock->sk = NULL; @@ -422,6 +421,7 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct raw_sock *ro = raw_sk(sk); + struct net_device *dev = NULL; int ifindex; int err = 0; int notify_enetdown = 0; @@ -431,14 +431,13 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (addr->can_family != AF_CAN) return -EINVAL; + rtnl_lock(); lock_sock(sk); if (ro->bound && addr->can_ifindex == ro->ifindex) goto out; if (addr->can_ifindex) { - struct net_device *dev; - dev = dev_get_by_index(sock_net(sk), addr->can_ifindex); if (!dev) { err = -ENODEV; @@ -467,26 +466,20 @@ static int raw_bind(struct socket *sock, struct sockaddr *uaddr, int len) if (!err) { if (ro->bound) { /* unregister old filters */ - if (ro->ifindex) { - struct net_device *dev; - - dev = dev_get_by_index(sock_net(sk), - ro->ifindex); - if (dev) { - raw_disable_allfilters(dev_net(dev), - dev, sk); - dev_put(dev); - } - } else { + if (ro->dev) + raw_disable_allfilters(dev_net(ro->dev), + ro->dev, sk); + else raw_disable_allfilters(sock_net(sk), NULL, sk); - } } ro->ifindex = ifindex; ro->bound = 1; + ro->dev = dev; } out: release_sock(sk); + rtnl_unlock(); if (notify_enetdown) { sk->sk_err = ENETDOWN; @@ -553,9 +546,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) { - dev = dev_get_by_index(sock_net(sk), ro->ifindex); - if (!dev) { + dev = ro->dev; + if (ro->bound && dev) { + if (dev->reg_state != NETREG_REGISTERED) { if (count > 1) kfree(filter); err = -ENODEV; @@ -596,7 +589,6 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->count = count; out_fil: - dev_put(dev); release_sock(sk); rtnl_unlock(); @@ -614,9 +606,9 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, rtnl_lock(); lock_sock(sk); - if (ro->bound && ro->ifindex) { - dev = dev_get_by_index(sock_net(sk), ro->ifindex); - if (!dev) { + dev = ro->dev; + if (ro->bound && dev) { + if (dev->reg_state != NETREG_REGISTERED) { err = -ENODEV; goto out_err; } @@ -640,7 +632,6 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, ro->err_mask = err_mask; out_err: - dev_put(dev); release_sock(sk); rtnl_unlock(); -- cgit v1.2.3 From 55c3b96074f3f9b0aee19bf93cd71af7516582bb Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Sat, 15 Jul 2023 17:25:43 +0800 Subject: can: bcm: Fix UAF in bcm_proc_show() BUG: KASAN: slab-use-after-free in bcm_proc_show+0x969/0xa80 Read of size 8 at addr ffff888155846230 by task cat/7862 CPU: 1 PID: 7862 Comm: cat Not tainted 6.5.0-rc1-00153-gc8746099c197 #230 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 Call Trace: dump_stack_lvl+0xd5/0x150 print_report+0xc1/0x5e0 kasan_report+0xba/0xf0 bcm_proc_show+0x969/0xa80 seq_read_iter+0x4f6/0x1260 seq_read+0x165/0x210 proc_reg_read+0x227/0x300 vfs_read+0x1d5/0x8d0 ksys_read+0x11e/0x240 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd Allocated by task 7846: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 __kasan_kmalloc+0x9e/0xa0 bcm_sendmsg+0x264b/0x44e0 sock_sendmsg+0xda/0x180 ____sys_sendmsg+0x735/0x920 ___sys_sendmsg+0x11d/0x1b0 __sys_sendmsg+0xfa/0x1d0 do_syscall_64+0x35/0xb0 entry_SYSCALL_64_after_hwframe+0x63/0xcd Freed by task 7846: kasan_save_stack+0x1e/0x40 kasan_set_track+0x21/0x30 kasan_save_free_info+0x27/0x40 ____kasan_slab_free+0x161/0x1c0 slab_free_freelist_hook+0x119/0x220 __kmem_cache_free+0xb4/0x2e0 rcu_core+0x809/0x1bd0 bcm_op is freed before procfs entry be removed in bcm_release(), this lead to bcm_proc_show() may read the freed bcm_op. Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Signed-off-by: YueHaibing Reviewed-by: Oliver Hartkopp Acked-by: Oliver Hartkopp Link: https://lore.kernel.org/all/20230715092543.15548-1-yuehaibing@huawei.com Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde --- net/can/bcm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index 9ba35685b043..9168114fc87f 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1526,6 +1526,12 @@ static int bcm_release(struct socket *sock) lock_sock(sk); +#if IS_ENABLED(CONFIG_PROC_FS) + /* remove procfs entry */ + if (net->can.bcmproc_dir && bo->bcm_proc_read) + remove_proc_entry(bo->procname, net->can.bcmproc_dir); +#endif /* CONFIG_PROC_FS */ + list_for_each_entry_safe(op, next, &bo->tx_ops, list) bcm_remove_op(op); @@ -1561,12 +1567,6 @@ static int bcm_release(struct socket *sock) list_for_each_entry_safe(op, next, &bo->rx_ops, list) bcm_remove_op(op); -#if IS_ENABLED(CONFIG_PROC_FS) - /* remove procfs entry */ - if (net->can.bcmproc_dir && bo->bcm_proc_read) - remove_proc_entry(bo->procname, net->can.bcmproc_dir); -#endif /* CONFIG_PROC_FS */ - /* remove device reference */ if (bo->bound) { bo->bound = 0; -- cgit v1.2.3 From 2603be9e8167ddc7bea95dcfab9ffc33414215aa Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 7 Jul 2023 13:43:10 +0200 Subject: can: gs_usb: gs_can_open(): improve error handling The gs_usb driver handles USB devices with more than 1 CAN channel. The RX path for all channels share the same bulk endpoint (the transmitted bulk data encodes the channel number). These per-device resources are allocated and submitted by the first opened channel. During this allocation, the resources are either released immediately in case of a failure or the URBs are anchored. All anchored URBs are finally killed with gs_usb_disconnect(). Currently, gs_can_open() returns with an error if the allocation of a URB or a buffer fails. However, if usb_submit_urb() fails, the driver continues with the URBs submitted so far, even if no URBs were successfully submitted. Treat every error as fatal and free all allocated resources immediately. Switch to goto-style error handling, to prepare the driver for more per-device resource allocation. Cc: stable@vger.kernel.org Cc: John Whittington Link: https://lore.kernel.org/all/20230716-gs_usb-fix-time-stamp-counter-v1-1-9017cefcd9d5@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index d476c2884008..85b7b59c8426 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -833,6 +833,7 @@ static int gs_can_open(struct net_device *netdev) .mode = cpu_to_le32(GS_CAN_MODE_START), }; struct gs_host_frame *hf; + struct urb *urb = NULL; u32 ctrlmode; u32 flags = 0; int rc, i; @@ -856,13 +857,14 @@ static int gs_can_open(struct net_device *netdev) if (!parent->active_channels) { for (i = 0; i < GS_MAX_RX_URBS; i++) { - struct urb *urb; u8 *buf; /* alloc rx urb */ urb = usb_alloc_urb(0, GFP_KERNEL); - if (!urb) - return -ENOMEM; + if (!urb) { + rc = -ENOMEM; + goto out_usb_kill_anchored_urbs; + } /* alloc rx buffer */ buf = kmalloc(dev->parent->hf_size_rx, @@ -870,8 +872,8 @@ static int gs_can_open(struct net_device *netdev) if (!buf) { netdev_err(netdev, "No memory left for USB buffer\n"); - usb_free_urb(urb); - return -ENOMEM; + rc = -ENOMEM; + goto out_usb_free_urb; } /* fill, anchor, and submit rx urb */ @@ -894,9 +896,7 @@ static int gs_can_open(struct net_device *netdev) netdev_err(netdev, "usb_submit failed (err=%d)\n", rc); - usb_unanchor_urb(urb); - usb_free_urb(urb); - break; + goto out_usb_unanchor_urb; } /* Drop reference, @@ -945,7 +945,8 @@ static int gs_can_open(struct net_device *netdev) if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) gs_usb_timestamp_stop(dev); dev->can.state = CAN_STATE_STOPPED; - return rc; + + goto out_usb_kill_anchored_urbs; } parent->active_channels++; @@ -953,6 +954,18 @@ static int gs_can_open(struct net_device *netdev) netif_start_queue(netdev); return 0; + +out_usb_unanchor_urb: + usb_unanchor_urb(urb); +out_usb_free_urb: + usb_free_urb(urb); +out_usb_kill_anchored_urbs: + if (!parent->active_channels) + usb_kill_anchored_urbs(&dev->tx_submitted); + + close_candev(netdev); + + return rc; } static int gs_usb_get_state(const struct net_device *netdev, -- cgit v1.2.3 From 5886e4d5ecec3e22844efed90b2dd383ef804b3a Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 7 Jul 2023 18:44:23 +0200 Subject: can: gs_usb: fix time stamp counter initialization If the gs_usb device driver is unloaded (or unbound) before the interface is shut down, the USB stack first calls the struct usb_driver::disconnect and then the struct net_device_ops::ndo_stop callback. In gs_usb_disconnect() all pending bulk URBs are killed, i.e. no more RX'ed CAN frames are send from the USB device to the host. Later in gs_can_close() a reset control message is send to each CAN channel to remove the controller from the CAN bus. In this race window the USB device can still receive CAN frames from the bus and internally queue them to be send to the host. At least in the current version of the candlelight firmware, the queue of received CAN frames is not emptied during the reset command. After loading (or binding) the gs_usb driver, new URBs are submitted during the struct net_device_ops::ndo_open callback and the candlelight firmware starts sending its already queued CAN frames to the host. However, this scenario was not considered when implementing the hardware timestamp function. The cycle counter/time counter infrastructure is set up (gs_usb_timestamp_init()) after the USBs are submitted, resulting in a NULL pointer dereference if timecounter_cyc2time() (via the call chain: gs_usb_receive_bulk_callback() -> gs_usb_set_timestamp() -> gs_usb_skb_set_timestamp()) is called too early. Move the gs_usb_timestamp_init() function before the URBs are submitted to fix this problem. For a comprehensive solution, we need to consider gs_usb devices with more than 1 channel. The cycle counter/time counter infrastructure is setup per channel, but the RX URBs are per device. Once gs_can_open() of _a_ channel has been called, and URBs have been submitted, the gs_usb_receive_bulk_callback() can be called for _all_ available channels, even for channels that are not running, yet. As cycle counter/time counter has not set up, this will again lead to a NULL pointer dereference. Convert the cycle counter/time counter from a "per channel" to a "per device" functionality. Also set it up, before submitting any URBs to the device. Further in gs_usb_receive_bulk_callback(), don't process any URBs for not started CAN channels, only resubmit the URB. Fixes: 45dfa45f52e6 ("can: gs_usb: add RX and TX hardware timestamp support") Closes: https://github.com/candle-usb/candleLight_fw/issues/137#issuecomment-1623532076 Cc: stable@vger.kernel.org Cc: John Whittington Link: https://lore.kernel.org/all/20230716-gs_usb-fix-time-stamp-counter-v1-2-9017cefcd9d5@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/usb/gs_usb.c | 101 +++++++++++++++++++++++-------------------- 1 file changed, 53 insertions(+), 48 deletions(-) diff --git a/drivers/net/can/usb/gs_usb.c b/drivers/net/can/usb/gs_usb.c index 85b7b59c8426..f418066569fc 100644 --- a/drivers/net/can/usb/gs_usb.c +++ b/drivers/net/can/usb/gs_usb.c @@ -303,12 +303,6 @@ struct gs_can { struct can_bittiming_const bt_const, data_bt_const; unsigned int channel; /* channel number */ - /* time counter for hardware timestamps */ - struct cyclecounter cc; - struct timecounter tc; - spinlock_t tc_lock; /* spinlock to guard access tc->cycle_last */ - struct delayed_work timestamp; - u32 feature; unsigned int hf_size_tx; @@ -325,6 +319,13 @@ struct gs_usb { struct gs_can *canch[GS_MAX_INTF]; struct usb_anchor rx_submitted; struct usb_device *udev; + + /* time counter for hardware timestamps */ + struct cyclecounter cc; + struct timecounter tc; + spinlock_t tc_lock; /* spinlock to guard access tc->cycle_last */ + struct delayed_work timestamp; + unsigned int hf_size_rx; u8 active_channels; }; @@ -388,15 +389,15 @@ static int gs_cmd_reset(struct gs_can *dev) GFP_KERNEL); } -static inline int gs_usb_get_timestamp(const struct gs_can *dev, +static inline int gs_usb_get_timestamp(const struct gs_usb *parent, u32 *timestamp_p) { __le32 timestamp; int rc; - rc = usb_control_msg_recv(dev->udev, 0, GS_USB_BREQ_TIMESTAMP, + rc = usb_control_msg_recv(parent->udev, 0, GS_USB_BREQ_TIMESTAMP, USB_DIR_IN | USB_TYPE_VENDOR | USB_RECIP_INTERFACE, - dev->channel, 0, + 0, 0, ×tamp, sizeof(timestamp), USB_CTRL_GET_TIMEOUT, GFP_KERNEL); @@ -410,20 +411,20 @@ static inline int gs_usb_get_timestamp(const struct gs_can *dev, static u64 gs_usb_timestamp_read(const struct cyclecounter *cc) __must_hold(&dev->tc_lock) { - struct gs_can *dev = container_of(cc, struct gs_can, cc); + struct gs_usb *parent = container_of(cc, struct gs_usb, cc); u32 timestamp = 0; int err; - lockdep_assert_held(&dev->tc_lock); + lockdep_assert_held(&parent->tc_lock); /* drop lock for synchronous USB transfer */ - spin_unlock_bh(&dev->tc_lock); - err = gs_usb_get_timestamp(dev, ×tamp); - spin_lock_bh(&dev->tc_lock); + spin_unlock_bh(&parent->tc_lock); + err = gs_usb_get_timestamp(parent, ×tamp); + spin_lock_bh(&parent->tc_lock); if (err) - netdev_err(dev->netdev, - "Error %d while reading timestamp. HW timestamps may be inaccurate.", - err); + dev_err(&parent->udev->dev, + "Error %d while reading timestamp. HW timestamps may be inaccurate.", + err); return timestamp; } @@ -431,14 +432,14 @@ static u64 gs_usb_timestamp_read(const struct cyclecounter *cc) __must_hold(&dev static void gs_usb_timestamp_work(struct work_struct *work) { struct delayed_work *delayed_work = to_delayed_work(work); - struct gs_can *dev; + struct gs_usb *parent; - dev = container_of(delayed_work, struct gs_can, timestamp); - spin_lock_bh(&dev->tc_lock); - timecounter_read(&dev->tc); - spin_unlock_bh(&dev->tc_lock); + parent = container_of(delayed_work, struct gs_usb, timestamp); + spin_lock_bh(&parent->tc_lock); + timecounter_read(&parent->tc); + spin_unlock_bh(&parent->tc_lock); - schedule_delayed_work(&dev->timestamp, + schedule_delayed_work(&parent->timestamp, GS_USB_TIMESTAMP_WORK_DELAY_SEC * HZ); } @@ -446,37 +447,38 @@ static void gs_usb_skb_set_timestamp(struct gs_can *dev, struct sk_buff *skb, u32 timestamp) { struct skb_shared_hwtstamps *hwtstamps = skb_hwtstamps(skb); + struct gs_usb *parent = dev->parent; u64 ns; - spin_lock_bh(&dev->tc_lock); - ns = timecounter_cyc2time(&dev->tc, timestamp); - spin_unlock_bh(&dev->tc_lock); + spin_lock_bh(&parent->tc_lock); + ns = timecounter_cyc2time(&parent->tc, timestamp); + spin_unlock_bh(&parent->tc_lock); hwtstamps->hwtstamp = ns_to_ktime(ns); } -static void gs_usb_timestamp_init(struct gs_can *dev) +static void gs_usb_timestamp_init(struct gs_usb *parent) { - struct cyclecounter *cc = &dev->cc; + struct cyclecounter *cc = &parent->cc; cc->read = gs_usb_timestamp_read; cc->mask = CYCLECOUNTER_MASK(32); cc->shift = 32 - bits_per(NSEC_PER_SEC / GS_USB_TIMESTAMP_TIMER_HZ); cc->mult = clocksource_hz2mult(GS_USB_TIMESTAMP_TIMER_HZ, cc->shift); - spin_lock_init(&dev->tc_lock); - spin_lock_bh(&dev->tc_lock); - timecounter_init(&dev->tc, &dev->cc, ktime_get_real_ns()); - spin_unlock_bh(&dev->tc_lock); + spin_lock_init(&parent->tc_lock); + spin_lock_bh(&parent->tc_lock); + timecounter_init(&parent->tc, &parent->cc, ktime_get_real_ns()); + spin_unlock_bh(&parent->tc_lock); - INIT_DELAYED_WORK(&dev->timestamp, gs_usb_timestamp_work); - schedule_delayed_work(&dev->timestamp, + INIT_DELAYED_WORK(&parent->timestamp, gs_usb_timestamp_work); + schedule_delayed_work(&parent->timestamp, GS_USB_TIMESTAMP_WORK_DELAY_SEC * HZ); } -static void gs_usb_timestamp_stop(struct gs_can *dev) +static void gs_usb_timestamp_stop(struct gs_usb *parent) { - cancel_delayed_work_sync(&dev->timestamp); + cancel_delayed_work_sync(&parent->timestamp); } static void gs_update_state(struct gs_can *dev, struct can_frame *cf) @@ -560,6 +562,9 @@ static void gs_usb_receive_bulk_callback(struct urb *urb) if (!netif_device_present(netdev)) return; + if (!netif_running(netdev)) + goto resubmit_urb; + if (hf->echo_id == -1) { /* normal rx */ if (hf->flags & GS_CAN_FLAG_FD) { skb = alloc_canfd_skb(dev->netdev, &cfd); @@ -856,6 +861,9 @@ static int gs_can_open(struct net_device *netdev) } if (!parent->active_channels) { + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + gs_usb_timestamp_init(parent); + for (i = 0; i < GS_MAX_RX_URBS; i++) { u8 *buf; @@ -926,13 +934,9 @@ static int gs_can_open(struct net_device *netdev) flags |= GS_CAN_MODE_FD; /* if hardware supports timestamps, enable it */ - if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) { + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) flags |= GS_CAN_MODE_HW_TIMESTAMP; - /* start polling timestamp */ - gs_usb_timestamp_init(dev); - } - /* finally start device */ dev->can.state = CAN_STATE_ERROR_ACTIVE; dm.flags = cpu_to_le32(flags); @@ -942,8 +946,6 @@ static int gs_can_open(struct net_device *netdev) GFP_KERNEL); if (rc) { netdev_err(netdev, "Couldn't start device (err=%d)\n", rc); - if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) - gs_usb_timestamp_stop(dev); dev->can.state = CAN_STATE_STOPPED; goto out_usb_kill_anchored_urbs; @@ -960,9 +962,13 @@ out_usb_unanchor_urb: out_usb_free_urb: usb_free_urb(urb); out_usb_kill_anchored_urbs: - if (!parent->active_channels) + if (!parent->active_channels) { usb_kill_anchored_urbs(&dev->tx_submitted); + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + gs_usb_timestamp_stop(parent); + } + close_candev(netdev); return rc; @@ -1011,14 +1017,13 @@ static int gs_can_close(struct net_device *netdev) netif_stop_queue(netdev); - /* stop polling timestamp */ - if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) - gs_usb_timestamp_stop(dev); - /* Stop polling */ parent->active_channels--; if (!parent->active_channels) { usb_kill_anchored_urbs(&parent->rx_submitted); + + if (dev->feature & GS_CAN_FEATURE_HW_TIMESTAMP) + gs_usb_timestamp_stop(parent); } /* Stop sending URBs */ -- cgit v1.2.3 From 2480232c61b888a160153b9022858ac8c4362d5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Fri, 14 Jul 2023 15:29:31 -0300 Subject: perf test task_exit: No need for a cycles event to check if we get an PERF_RECORD_EXIT The intent of this test is to check we get a PERF_RECORD_EXIT as asked for by setting perf_event_attr.task=1. When the test was written we didn't had the "dummy" event so we went with the default event, "cycles". There were reports of this test failing sometimes, one of these reports was with a PREEMPT_RT_FULL, but I noticed it failing sometimes with an aarch64 Firefly board. In the kernel the call to perf_event_task_output(), that generates the PERF_RECORD_EXIT may fail when there is not enough memory in the ring buffer, if the ring buffer is paused, etc. So switch to using the "dummy" event to use the ring buffer just for what the test was designed for, avoiding uneeded PERF_RECORD_SAMPLEs. Acked-by: Ian Rogers Cc: Adrian Hunter Cc: Jiri Olsa Cc: Juri Lelli Cc: Namhyung Kim Link: https://lore.kernel.org/lkml/ZLGXmMuNRpx1ubFm@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/tests/task-exit.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 25f075fa9125..968dddde6dda 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -58,9 +58,9 @@ static int test__task_exit(struct test_suite *test __maybe_unused, int subtest _ signal(SIGCHLD, sig_handler); - evlist = evlist__new_default(); + evlist = evlist__new_dummy(); if (evlist == NULL) { - pr_debug("evlist__new_default\n"); + pr_debug("evlist__new_dummy\n"); return -1; } -- cgit v1.2.3 From 371baf5c9750a258fee21d0cb8c8d683bb057429 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 10 Jul 2023 09:02:33 -0700 Subject: xfs: convert flex-array declarations in struct xfs_attrlist* As of 6.5-rc1, UBSAN trips over the attrlist ioctl definitions using an array length of 1 to pretend to be a flex array. Kernel compilers have to support unbounded array declarations, so let's correct this. This may cause friction with userspace header declarations, but suck is life. ================================================================================ UBSAN: array-index-out-of-bounds in fs/xfs/xfs_ioctl.c:345:18 index 1 is out of range for type '__s32 [1]' Call Trace: dump_stack_lvl+0x33/0x50 __ubsan_handle_out_of_bounds+0x9c/0xd0 xfs_ioc_attr_put_listent+0x413/0x420 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attr_list_ilocked+0x170/0x850 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attr_list+0xb7/0x120 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_ioc_attr_list+0x13b/0x2e0 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attrlist_by_handle+0xab/0x120 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_file_ioctl+0x1ff/0x15e0 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] vfs_ioctl+0x1f/0x60 The kernel and xfsprogs code that uses these structures will not have problems, but the long tail of external user programs might. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Kees Cook --- fs/xfs/libxfs/xfs_fs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 9c60ebb328b4..2cbf9ea39b8c 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -592,12 +592,12 @@ typedef struct xfs_attrlist_cursor { struct xfs_attrlist { __s32 al_count; /* number of entries in attrlist */ __s32 al_more; /* T/F: more attrs (do call again) */ - __s32 al_offset[1]; /* byte offsets of attrs [var-sized] */ + __s32 al_offset[]; /* byte offsets of attrs [var-sized] */ }; struct xfs_attrlist_ent { /* data from attr_list() */ __u32 a_valuelen; /* number bytes in value of attr */ - char a_name[1]; /* attr name (NULL terminated) */ + char a_name[]; /* attr name (NULL terminated) */ }; typedef struct xfs_fsop_attrlist_handlereq { -- cgit v1.2.3 From a49bbce58ea90b14d4cb1d00681023a8606955f2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 10 Jul 2023 09:12:20 -0700 Subject: xfs: convert flex-array declarations in xfs attr leaf blocks As of 6.5-rc1, UBSAN trips over the ondisk extended attribute leaf block definitions using an array length of 1 to pretend to be a flex array. Kernel compilers have to support unbounded array declarations, so let's correct this. ================================================================================ UBSAN: array-index-out-of-bounds in fs/xfs/libxfs/xfs_attr_leaf.c:2535:24 index 2 is out of range for type '__u8 [1]' Call Trace: dump_stack_lvl+0x33/0x50 __ubsan_handle_out_of_bounds+0x9c/0xd0 xfs_attr3_leaf_getvalue+0x2ce/0x2e0 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attr_leaf_get+0x148/0x1c0 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attr_get_ilocked+0xae/0x110 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_attr_get+0xee/0x150 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] xfs_xattr_get+0x7d/0xc0 [xfs 4a986a89a77bb77402ab8a87a37da369ef6a3f09] __vfs_getxattr+0xa3/0x100 vfs_getxattr+0x87/0x1d0 do_getxattr+0x17a/0x220 getxattr+0x89/0xf0 Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Kees Cook --- fs/xfs/libxfs/xfs_da_format.h | 73 ++++++++++++++++++++++++++++++++++++++----- fs/xfs/xfs_ondisk.h | 4 +-- 2 files changed, 67 insertions(+), 10 deletions(-) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index 25e2841084e1..b2362717c42e 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -620,19 +620,29 @@ typedef struct xfs_attr_leaf_entry { /* sorted on key, not name */ typedef struct xfs_attr_leaf_name_local { __be16 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 nameval[1]; /* name/value bytes */ + /* + * In Linux 6.5 this flex array was converted from nameval[1] to + * nameval[]. Be very careful here about extra padding at the end; + * see xfs_attr_leaf_entsize_local() for details. + */ + __u8 nameval[]; /* name/value bytes */ } xfs_attr_leaf_name_local_t; typedef struct xfs_attr_leaf_name_remote { __be32 valueblk; /* block number of value bytes */ __be32 valuelen; /* number of bytes in value */ __u8 namelen; /* length of name bytes */ - __u8 name[1]; /* name bytes */ + /* + * In Linux 6.5 this flex array was converted from name[1] to name[]. + * Be very careful here about extra padding at the end; see + * xfs_attr_leaf_entsize_remote() for details. + */ + __u8 name[]; /* name bytes */ } xfs_attr_leaf_name_remote_t; typedef struct xfs_attr_leafblock { xfs_attr_leaf_hdr_t hdr; /* constant-structure header block */ - xfs_attr_leaf_entry_t entries[1]; /* sorted on key, not name */ + xfs_attr_leaf_entry_t entries[]; /* sorted on key, not name */ /* * The rest of the block contains the following structures after the * leaf entries, growing from the bottom up. The variables are never @@ -664,7 +674,7 @@ struct xfs_attr3_leaf_hdr { struct xfs_attr3_leafblock { struct xfs_attr3_leaf_hdr hdr; - struct xfs_attr_leaf_entry entries[1]; + struct xfs_attr_leaf_entry entries[]; /* * The rest of the block contains the following structures after the @@ -747,14 +757,61 @@ xfs_attr3_leaf_name_local(xfs_attr_leafblock_t *leafp, int idx) */ static inline int xfs_attr_leaf_entsize_remote(int nlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_remote) - 1 + - nlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_remote ended with + * name[1], which was used as a flexarray. The layout of this struct + * is 9 bytes of fixed-length fields followed by a __u8 flex array at + * offset 9. + * + * On most architectures, struct xfs_attr_leaf_name_remote had two + * bytes of implicit padding at the end of the struct to make the + * struct length 12. After converting name[1] to name[], there are + * three implicit padding bytes and the struct size remains 12. + * However, there are compiler configurations that do not add implicit + * padding at all (m68k) and have been broken for years. + * + * This entsize computation historically added (the xattr name length) + * to (the padded struct length - 1) and rounded that sum up to the + * nearest multiple of 4 (NAME_ALIGN). IOWs, round_up(11 + nlen, 4). + * This is encoded in the ondisk format, so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t remotesize = + offsetof(struct xfs_attr_leaf_name_remote, name) + 2; + + return round_up(remotesize + nlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local(int nlen, int vlen) { - return round_up(sizeof(struct xfs_attr_leaf_name_local) - 1 + - nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); + /* + * Prior to Linux 6.5, struct xfs_attr_leaf_name_local ended with + * nameval[1], which was used as a flexarray. The layout of this + * struct is 3 bytes of fixed-length fields followed by a __u8 flex + * array at offset 3. + * + * struct xfs_attr_leaf_name_local had zero bytes of implicit padding + * at the end of the struct to make the struct length 4. On most + * architectures, after converting nameval[1] to nameval[], there is + * one implicit padding byte and the struct size remains 4. However, + * there are compiler configurations that do not add implicit padding + * at all (m68k) and would break. + * + * This entsize computation historically added (the xattr name and + * value length) to (the padded struct length - 1) and rounded that sum + * up to the nearest multiple of 4 (NAME_ALIGN). IOWs, the formula is + * round_up(3 + nlen + vlen, 4). This is encoded in the ondisk format, + * so we cannot change this. + * + * Compute the entsize from offsetof of the flexarray and manually + * adding bytes for the implicit padding. + */ + const size_t localsize = + offsetof(struct xfs_attr_leaf_name_local, nameval); + + return round_up(localsize + nlen + vlen, XFS_ATTR_LEAF_NAME_ALIGN); } static inline int xfs_attr_leaf_entsize_local_max(int bsize) diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 9737b5a9f405..37be297f2532 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -56,7 +56,7 @@ xfs_check_ondisk_structs(void) /* dir/attr trees */ XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leaf_hdr, 80); - XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 88); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_leafblock, 80); XFS_CHECK_STRUCT_SIZE(struct xfs_attr3_rmt_hdr, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_blkinfo, 56); XFS_CHECK_STRUCT_SIZE(struct xfs_da3_intnode, 64); @@ -88,7 +88,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, valuelen, 4); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); - XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 40); + XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); -- cgit v1.2.3 From f6250e205691a58c81be041b1809a2e706852641 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 10 Jul 2023 09:32:11 -0700 Subject: xfs: convert flex-array declarations in xfs attr shortform objects As of 6.5-rc1, UBSAN trips over the ondisk extended attribute shortform definitions using an array length of 1 to pretend to be a flex array. Kernel compilers have to support unbounded array declarations, so let's correct this. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Kees Cook --- fs/xfs/libxfs/xfs_da_format.h | 2 +- fs/xfs/xfs_ondisk.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_da_format.h b/fs/xfs/libxfs/xfs_da_format.h index b2362717c42e..f9015f88eca7 100644 --- a/fs/xfs/libxfs/xfs_da_format.h +++ b/fs/xfs/libxfs/xfs_da_format.h @@ -591,7 +591,7 @@ struct xfs_attr_shortform { uint8_t valuelen; /* actual length of value (no NULL) */ uint8_t flags; /* flags bits (see xfs_attr_leaf.h) */ uint8_t nameval[]; /* name & value bytes concatenated */ - } list[1]; /* variable sized array */ + } list[]; /* variable sized array */ }; typedef struct xfs_attr_leaf_map { /* RLE map of free bytes */ diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h index 37be297f2532..c4cc99b70dd3 100644 --- a/fs/xfs/xfs_ondisk.h +++ b/fs/xfs/xfs_ondisk.h @@ -89,6 +89,7 @@ xfs_check_ondisk_structs(void) XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, namelen, 8); XFS_CHECK_OFFSET(xfs_attr_leaf_name_remote_t, name, 9); XFS_CHECK_STRUCT_SIZE(xfs_attr_leafblock_t, 32); + XFS_CHECK_STRUCT_SIZE(struct xfs_attr_shortform, 4); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.totsize, 0); XFS_CHECK_OFFSET(struct xfs_attr_shortform, hdr.count, 2); XFS_CHECK_OFFSET(struct xfs_attr_shortform, list[0].namelen, 4); -- cgit v1.2.3 From 20c64ec83a9f779a750bbbcc1d07d065702313a5 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jul 2023 08:49:57 -0700 Subject: iomap: fix a regression for partial write errors When write* wrote some data it should return the amount of written data and not the error code that caused it to stop. Fix a recent regression in iomap_file_buffered_write that caused it to return the errno instead. Fixes: 219580eea1ee ("iomap: update ki_pos in iomap_file_buffered_write") Reported-by: kernel test robot Reported-by: Cyril Hrubis Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Ritesh Harjani (IBM) --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index adb92cdb24b0..7cc9f7274883 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -872,7 +872,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_write_iter(&iter, i); - if (unlikely(ret < 0)) + if (unlikely(iter.pos == iocb->ki_pos)) return ret; ret = iter.pos - iocb->ki_pos; iocb->ki_pos += ret; -- cgit v1.2.3 From efa96cc99793bafe96bdbff6abab94d81472a32d Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 17 Jul 2023 08:49:57 -0700 Subject: iomap: micro optimize the ki_pos assignment in iomap_file_buffered_write We have the new value for ki_pos right at hand in iter.pos, so assign that instead of recalculating it from ret. Signed-off-by: Christoph Hellwig Reviewed-by: Darrick J. Wong Signed-off-by: Darrick J. Wong Reviewed-by: Ritesh Harjani (IBM) --- fs/iomap/buffered-io.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 7cc9f7274883..aa8967cca1a3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -875,7 +875,7 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (unlikely(iter.pos == iocb->ki_pos)) return ret; ret = iter.pos - iocb->ki_pos; - iocb->ki_pos += ret; + iocb->ki_pos = iter.pos; return ret; } EXPORT_SYMBOL_GPL(iomap_file_buffered_write); -- cgit v1.2.3 From 5f4fa1672d98fe99d2297b03add35346f1685d6b Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Tue, 9 May 2023 19:11:47 +0800 Subject: iavf: Fix use-after-free in free_netdev We do netif_napi_add() for all allocated q_vectors[], but potentially do netif_napi_del() for part of them, then kfree q_vectors and leave invalid pointers at dev->napi_list. Reproducer: [root@host ~]# cat repro.sh #!/bin/bash pf_dbsf="0000:41:00.0" vf0_dbsf="0000:41:02.0" g_pids=() function do_set_numvf() { echo 2 >/sys/bus/pci/devices/${pf_dbsf}/sriov_numvfs sleep $((RANDOM%3+1)) echo 0 >/sys/bus/pci/devices/${pf_dbsf}/sriov_numvfs sleep $((RANDOM%3+1)) } function do_set_channel() { local nic=$(ls -1 --indicator-style=none /sys/bus/pci/devices/${vf0_dbsf}/net/) [ -z "$nic" ] && { sleep $((RANDOM%3)) ; return 1; } ifconfig $nic 192.168.18.5 netmask 255.255.255.0 ifconfig $nic up ethtool -L $nic combined 1 ethtool -L $nic combined 4 sleep $((RANDOM%3)) } function on_exit() { local pid for pid in "${g_pids[@]}"; do kill -0 "$pid" &>/dev/null && kill "$pid" &>/dev/null done g_pids=() } trap "on_exit; exit" EXIT while :; do do_set_numvf ; done & g_pids+=($!) while :; do do_set_channel ; done & g_pids+=($!) wait Result: [ 4093.900222] ================================================================== [ 4093.900230] BUG: KASAN: use-after-free in free_netdev+0x308/0x390 [ 4093.900232] Read of size 8 at addr ffff88b4dc145640 by task repro.sh/6699 [ 4093.900233] [ 4093.900236] CPU: 10 PID: 6699 Comm: repro.sh Kdump: loaded Tainted: G O --------- -t - 4.18.0 #1 [ 4093.900238] Hardware name: Powerleader PR2008AL/H12DSi-N6, BIOS 2.0 04/09/2021 [ 4093.900239] Call Trace: [ 4093.900244] dump_stack+0x71/0xab [ 4093.900249] print_address_description+0x6b/0x290 [ 4093.900251] ? free_netdev+0x308/0x390 [ 4093.900252] kasan_report+0x14a/0x2b0 [ 4093.900254] free_netdev+0x308/0x390 [ 4093.900261] iavf_remove+0x825/0xd20 [iavf] [ 4093.900265] pci_device_remove+0xa8/0x1f0 [ 4093.900268] device_release_driver_internal+0x1c6/0x460 [ 4093.900271] pci_stop_bus_device+0x101/0x150 [ 4093.900273] pci_stop_and_remove_bus_device+0xe/0x20 [ 4093.900275] pci_iov_remove_virtfn+0x187/0x420 [ 4093.900277] ? pci_iov_add_virtfn+0xe10/0xe10 [ 4093.900278] ? pci_get_subsys+0x90/0x90 [ 4093.900280] sriov_disable+0xed/0x3e0 [ 4093.900282] ? bus_find_device+0x12d/0x1a0 [ 4093.900290] i40e_free_vfs+0x754/0x1210 [i40e] [ 4093.900298] ? i40e_reset_all_vfs+0x880/0x880 [i40e] [ 4093.900299] ? pci_get_device+0x7c/0x90 [ 4093.900300] ? pci_get_subsys+0x90/0x90 [ 4093.900306] ? pci_vfs_assigned.part.7+0x144/0x210 [ 4093.900309] ? __mutex_lock_slowpath+0x10/0x10 [ 4093.900315] i40e_pci_sriov_configure+0x1fa/0x2e0 [i40e] [ 4093.900318] sriov_numvfs_store+0x214/0x290 [ 4093.900320] ? sriov_totalvfs_show+0x30/0x30 [ 4093.900321] ? __mutex_lock_slowpath+0x10/0x10 [ 4093.900323] ? __check_object_size+0x15a/0x350 [ 4093.900326] kernfs_fop_write+0x280/0x3f0 [ 4093.900329] vfs_write+0x145/0x440 [ 4093.900330] ksys_write+0xab/0x160 [ 4093.900332] ? __ia32_sys_read+0xb0/0xb0 [ 4093.900334] ? fput_many+0x1a/0x120 [ 4093.900335] ? filp_close+0xf0/0x130 [ 4093.900338] do_syscall_64+0xa0/0x370 [ 4093.900339] ? page_fault+0x8/0x30 [ 4093.900341] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 4093.900357] RIP: 0033:0x7f16ad4d22c0 [ 4093.900359] Code: 73 01 c3 48 8b 0d d8 cb 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 89 24 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 fe dd 01 00 48 89 04 24 [ 4093.900360] RSP: 002b:00007ffd6491b7f8 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 4093.900362] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f16ad4d22c0 [ 4093.900363] RDX: 0000000000000002 RSI: 0000000001a41408 RDI: 0000000000000001 [ 4093.900364] RBP: 0000000001a41408 R08: 00007f16ad7a1780 R09: 00007f16ae1f2700 [ 4093.900364] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000002 [ 4093.900365] R13: 0000000000000001 R14: 00007f16ad7a0620 R15: 0000000000000001 [ 4093.900367] [ 4093.900368] Allocated by task 820: [ 4093.900371] kasan_kmalloc+0xa6/0xd0 [ 4093.900373] __kmalloc+0xfb/0x200 [ 4093.900376] iavf_init_interrupt_scheme+0x63b/0x1320 [iavf] [ 4093.900380] iavf_watchdog_task+0x3d51/0x52c0 [iavf] [ 4093.900382] process_one_work+0x56a/0x11f0 [ 4093.900383] worker_thread+0x8f/0xf40 [ 4093.900384] kthread+0x2a0/0x390 [ 4093.900385] ret_from_fork+0x1f/0x40 [ 4093.900387] 0xffffffffffffffff [ 4093.900387] [ 4093.900388] Freed by task 6699: [ 4093.900390] __kasan_slab_free+0x137/0x190 [ 4093.900391] kfree+0x8b/0x1b0 [ 4093.900394] iavf_free_q_vectors+0x11d/0x1a0 [iavf] [ 4093.900397] iavf_remove+0x35a/0xd20 [iavf] [ 4093.900399] pci_device_remove+0xa8/0x1f0 [ 4093.900400] device_release_driver_internal+0x1c6/0x460 [ 4093.900401] pci_stop_bus_device+0x101/0x150 [ 4093.900402] pci_stop_and_remove_bus_device+0xe/0x20 [ 4093.900403] pci_iov_remove_virtfn+0x187/0x420 [ 4093.900404] sriov_disable+0xed/0x3e0 [ 4093.900409] i40e_free_vfs+0x754/0x1210 [i40e] [ 4093.900415] i40e_pci_sriov_configure+0x1fa/0x2e0 [i40e] [ 4093.900416] sriov_numvfs_store+0x214/0x290 [ 4093.900417] kernfs_fop_write+0x280/0x3f0 [ 4093.900418] vfs_write+0x145/0x440 [ 4093.900419] ksys_write+0xab/0x160 [ 4093.900420] do_syscall_64+0xa0/0x370 [ 4093.900421] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 4093.900422] 0xffffffffffffffff [ 4093.900422] [ 4093.900424] The buggy address belongs to the object at ffff88b4dc144200 which belongs to the cache kmalloc-8k of size 8192 [ 4093.900425] The buggy address is located 5184 bytes inside of 8192-byte region [ffff88b4dc144200, ffff88b4dc146200) [ 4093.900425] The buggy address belongs to the page: [ 4093.900427] page:ffffea00d3705000 refcount:1 mapcount:0 mapping:ffff88bf04415c80 index:0x0 compound_mapcount: 0 [ 4093.900430] flags: 0x10000000008100(slab|head) [ 4093.900433] raw: 0010000000008100 dead000000000100 dead000000000200 ffff88bf04415c80 [ 4093.900434] raw: 0000000000000000 0000000000030003 00000001ffffffff 0000000000000000 [ 4093.900434] page dumped because: kasan: bad access detected [ 4093.900435] [ 4093.900435] Memory state around the buggy address: [ 4093.900436] ffff88b4dc145500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 4093.900437] ffff88b4dc145580: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 4093.900438] >ffff88b4dc145600: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 4093.900438] ^ [ 4093.900439] ffff88b4dc145680: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 4093.900440] ffff88b4dc145700: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb [ 4093.900440] ================================================================== Although the patch #2 (of 2) can avoid the issue triggered by this repro.sh, there still are other potential risks that if num_active_queues is changed to less than allocated q_vectors[] by unexpected, the mismatched netif_napi_add/del() can also cause UAF. Since we actually call netif_napi_add() for all allocated q_vectors unconditionally in iavf_alloc_q_vectors(), so we should fix it by letting netif_napi_del() match to netif_napi_add(). Fixes: 5eae00c57f5e ("i40evf: main driver core") Signed-off-by: Ding Hui Cc: Donglin Peng Cc: Huang Cun Reviewed-by: Simon Horman Reviewed-by: Madhu Chittim Reviewed-by: Leon Romanovsky Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index a483eb185c99..89d88c3a7add 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1828,19 +1828,16 @@ static int iavf_alloc_q_vectors(struct iavf_adapter *adapter) static void iavf_free_q_vectors(struct iavf_adapter *adapter) { int q_idx, num_q_vectors; - int napi_vectors; if (!adapter->q_vectors) return; num_q_vectors = adapter->num_msix_vectors - NONQ_VECS; - napi_vectors = adapter->num_active_queues; for (q_idx = 0; q_idx < num_q_vectors; q_idx++) { struct iavf_q_vector *q_vector = &adapter->q_vectors[q_idx]; - if (q_idx < napi_vectors) - netif_napi_del(&q_vector->napi); + netif_napi_del(&q_vector->napi); } kfree(adapter->q_vectors); adapter->q_vectors = NULL; -- cgit v1.2.3 From 7c4bced3caa749ce468b0c5de711c98476b23a52 Mon Sep 17 00:00:00 2001 From: Ding Hui Date: Tue, 9 May 2023 19:11:48 +0800 Subject: iavf: Fix out-of-bounds when setting channels on remove If we set channels greater during iavf_remove(), and waiting reset done would be timeout, then returned with error but changed num_active_queues directly, that will lead to OOB like the following logs. Because the num_active_queues is greater than tx/rx_rings[] allocated actually. Reproducer: [root@host ~]# cat repro.sh #!/bin/bash pf_dbsf="0000:41:00.0" vf0_dbsf="0000:41:02.0" g_pids=() function do_set_numvf() { echo 2 >/sys/bus/pci/devices/${pf_dbsf}/sriov_numvfs sleep $((RANDOM%3+1)) echo 0 >/sys/bus/pci/devices/${pf_dbsf}/sriov_numvfs sleep $((RANDOM%3+1)) } function do_set_channel() { local nic=$(ls -1 --indicator-style=none /sys/bus/pci/devices/${vf0_dbsf}/net/) [ -z "$nic" ] && { sleep $((RANDOM%3)) ; return 1; } ifconfig $nic 192.168.18.5 netmask 255.255.255.0 ifconfig $nic up ethtool -L $nic combined 1 ethtool -L $nic combined 4 sleep $((RANDOM%3)) } function on_exit() { local pid for pid in "${g_pids[@]}"; do kill -0 "$pid" &>/dev/null && kill "$pid" &>/dev/null done g_pids=() } trap "on_exit; exit" EXIT while :; do do_set_numvf ; done & g_pids+=($!) while :; do do_set_channel ; done & g_pids+=($!) wait Result: [ 3506.152887] iavf 0000:41:02.0: Removing device [ 3510.400799] ================================================================== [ 3510.400820] BUG: KASAN: slab-out-of-bounds in iavf_free_all_tx_resources+0x156/0x160 [iavf] [ 3510.400823] Read of size 8 at addr ffff88b6f9311008 by task repro.sh/55536 [ 3510.400823] [ 3510.400830] CPU: 101 PID: 55536 Comm: repro.sh Kdump: loaded Tainted: G O --------- -t - 4.18.0 #1 [ 3510.400832] Hardware name: Powerleader PR2008AL/H12DSi-N6, BIOS 2.0 04/09/2021 [ 3510.400835] Call Trace: [ 3510.400851] dump_stack+0x71/0xab [ 3510.400860] print_address_description+0x6b/0x290 [ 3510.400865] ? iavf_free_all_tx_resources+0x156/0x160 [iavf] [ 3510.400868] kasan_report+0x14a/0x2b0 [ 3510.400873] iavf_free_all_tx_resources+0x156/0x160 [iavf] [ 3510.400880] iavf_remove+0x2b6/0xc70 [iavf] [ 3510.400884] ? iavf_free_all_rx_resources+0x160/0x160 [iavf] [ 3510.400891] ? wait_woken+0x1d0/0x1d0 [ 3510.400895] ? notifier_call_chain+0xc1/0x130 [ 3510.400903] pci_device_remove+0xa8/0x1f0 [ 3510.400910] device_release_driver_internal+0x1c6/0x460 [ 3510.400916] pci_stop_bus_device+0x101/0x150 [ 3510.400919] pci_stop_and_remove_bus_device+0xe/0x20 [ 3510.400924] pci_iov_remove_virtfn+0x187/0x420 [ 3510.400927] ? pci_iov_add_virtfn+0xe10/0xe10 [ 3510.400929] ? pci_get_subsys+0x90/0x90 [ 3510.400932] sriov_disable+0xed/0x3e0 [ 3510.400936] ? bus_find_device+0x12d/0x1a0 [ 3510.400953] i40e_free_vfs+0x754/0x1210 [i40e] [ 3510.400966] ? i40e_reset_all_vfs+0x880/0x880 [i40e] [ 3510.400968] ? pci_get_device+0x7c/0x90 [ 3510.400970] ? pci_get_subsys+0x90/0x90 [ 3510.400982] ? pci_vfs_assigned.part.7+0x144/0x210 [ 3510.400987] ? __mutex_lock_slowpath+0x10/0x10 [ 3510.400996] i40e_pci_sriov_configure+0x1fa/0x2e0 [i40e] [ 3510.401001] sriov_numvfs_store+0x214/0x290 [ 3510.401005] ? sriov_totalvfs_show+0x30/0x30 [ 3510.401007] ? __mutex_lock_slowpath+0x10/0x10 [ 3510.401011] ? __check_object_size+0x15a/0x350 [ 3510.401018] kernfs_fop_write+0x280/0x3f0 [ 3510.401022] vfs_write+0x145/0x440 [ 3510.401025] ksys_write+0xab/0x160 [ 3510.401028] ? __ia32_sys_read+0xb0/0xb0 [ 3510.401031] ? fput_many+0x1a/0x120 [ 3510.401032] ? filp_close+0xf0/0x130 [ 3510.401038] do_syscall_64+0xa0/0x370 [ 3510.401041] ? page_fault+0x8/0x30 [ 3510.401043] entry_SYSCALL_64_after_hwframe+0x65/0xca [ 3510.401073] RIP: 0033:0x7f3a9bb842c0 [ 3510.401079] Code: 73 01 c3 48 8b 0d d8 cb 2c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 0f 1f 44 00 00 83 3d 89 24 2d 00 00 75 10 b8 01 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 fe dd 01 00 48 89 04 24 [ 3510.401080] RSP: 002b:00007ffc05f1fe18 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 3510.401083] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f3a9bb842c0 [ 3510.401085] RDX: 0000000000000002 RSI: 0000000002327408 RDI: 0000000000000001 [ 3510.401086] RBP: 0000000002327408 R08: 00007f3a9be53780 R09: 00007f3a9c8a4700 [ 3510.401086] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000002 [ 3510.401087] R13: 0000000000000001 R14: 00007f3a9be52620 R15: 0000000000000001 [ 3510.401090] [ 3510.401093] Allocated by task 76795: [ 3510.401098] kasan_kmalloc+0xa6/0xd0 [ 3510.401099] __kmalloc+0xfb/0x200 [ 3510.401104] iavf_init_interrupt_scheme+0x26f/0x1310 [iavf] [ 3510.401108] iavf_watchdog_task+0x1d58/0x4050 [iavf] [ 3510.401114] process_one_work+0x56a/0x11f0 [ 3510.401115] worker_thread+0x8f/0xf40 [ 3510.401117] kthread+0x2a0/0x390 [ 3510.401119] ret_from_fork+0x1f/0x40 [ 3510.401122] 0xffffffffffffffff [ 3510.401123] In timeout handling, we should keep the original num_active_queues and reset num_req_queues to 0. Fixes: 4e5e6b5d9d13 ("iavf: Fix return of set the new channel count") Signed-off-by: Ding Hui Cc: Donglin Peng Cc: Huang Cun Reviewed-by: Leon Romanovsky Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 6f171d1d85b7..92443f8e9fbd 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -1863,7 +1863,7 @@ static int iavf_set_channels(struct net_device *netdev, } if (i == IAVF_RESET_WAIT_COMPLETE_COUNT) { adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; - adapter->num_active_queues = num_req; + adapter->num_req_queues = 0; return -EOPNOTSUPP; } -- cgit v1.2.3 From a77ed5c5b768e9649be240a2d864e5cd9c6a2015 Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Fri, 19 May 2023 15:46:02 -0600 Subject: iavf: use internal state to free traffic IRQs If the system tries to close the netdev while iavf_reset_task() is running, __LINK_STATE_START will be cleared and netif_running() will return false in iavf_reinit_interrupt_scheme(). This will result in iavf_free_traffic_irqs() not being called and a leak as follows: [7632.489326] remove_proc_entry: removing non-empty directory 'irq/999', leaking at least 'iavf-enp24s0f0v0-TxRx-0' [7632.490214] WARNING: CPU: 0 PID: 10 at fs/proc/generic.c:718 remove_proc_entry+0x19b/0x1b0 is shown when pci_disable_msix() is later called. Fix by using the internal adapter state. The traffic IRQs will always exist if state == __IAVF_RUNNING. Fixes: 5b36e8d04b44 ("i40evf: Enable VF to request an alternate queue allocation") Signed-off-by: Ahmed Zaki Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 89d88c3a7add..058e19c6f94a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1929,15 +1929,16 @@ static void iavf_free_rss(struct iavf_adapter *adapter) /** * iavf_reinit_interrupt_scheme - Reallocate queues and vectors * @adapter: board private structure + * @running: true if adapter->state == __IAVF_RUNNING * * Returns 0 on success, negative on failure **/ -static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter) +static int iavf_reinit_interrupt_scheme(struct iavf_adapter *adapter, bool running) { struct net_device *netdev = adapter->netdev; int err; - if (netif_running(netdev)) + if (running) iavf_free_traffic_irqs(adapter); iavf_free_misc_irq(adapter); iavf_reset_interrupt_capability(adapter); @@ -3053,7 +3054,7 @@ continue_reset: if ((adapter->flags & IAVF_FLAG_REINIT_MSIX_NEEDED) || (adapter->flags & IAVF_FLAG_REINIT_ITR_NEEDED)) { - err = iavf_reinit_interrupt_scheme(adapter); + err = iavf_reinit_interrupt_scheme(adapter, running); if (err) goto reset_err; } -- cgit v1.2.3 From c2ed2403f12c74a74a0091ed5d830e72c58406e8 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Mon, 5 Jun 2023 10:52:22 -0400 Subject: iavf: Wait for reset in callbacks which trigger it There was a fail when trying to add the interface to bonding right after changing the MTU on the interface. It was caused by bonding interface unable to open the interface due to interface being in __RESETTING state because of MTU change. Add new reset_waitqueue to indicate that reset has finished. Add waiting for reset to finish in callbacks which trigger hw reset: iavf_set_priv_flags(), iavf_change_mtu() and iavf_set_ringparam(). We use a 5000ms timeout period because on Hyper-V based systems, this operation takes around 3000-4000ms. In normal circumstances, it doesn't take more than 500ms to complete. Add a function iavf_wait_for_reset() to reuse waiting for reset code and use it also in iavf_set_channels(), which already waits for reset. We don't use error handling in iavf_set_channels() as this could cause the device to be in incorrect state if the reset was scheduled but hit timeout or the waitng function was interrupted by a signal. Fixes: 4e5e6b5d9d13 ("iavf: Fix return of set the new channel count") Signed-off-by: Marcin Szycik Co-developed-by: Dawid Wesierski Signed-off-by: Dawid Wesierski Signed-off-by: Sylwester Dziedziuch Signed-off-by: Kamil Maziarz Signed-off-by: Mateusz Palczewski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 2 + drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 31 ++++++++------- drivers/net/ethernet/intel/iavf/iavf_main.c | 51 ++++++++++++++++++++++++- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 1 + 4 files changed, 68 insertions(+), 17 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index f80f2735e688..a5cab19eb6a8 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -257,6 +257,7 @@ struct iavf_adapter { struct work_struct adminq_task; struct delayed_work client_task; wait_queue_head_t down_waitqueue; + wait_queue_head_t reset_waitqueue; wait_queue_head_t vc_waitqueue; struct iavf_q_vector *q_vectors; struct list_head vlan_filter_list; @@ -582,4 +583,5 @@ void iavf_add_adv_rss_cfg(struct iavf_adapter *adapter); void iavf_del_adv_rss_cfg(struct iavf_adapter *adapter); struct iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, const u8 *macaddr); +int iavf_wait_for_reset(struct iavf_adapter *adapter); #endif /* _IAVF_H_ */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 92443f8e9fbd..b7141c2a941d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -484,6 +484,7 @@ static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) { struct iavf_adapter *adapter = netdev_priv(netdev); u32 orig_flags, new_flags, changed_flags; + int ret = 0; u32 i; orig_flags = READ_ONCE(adapter->flags); @@ -533,10 +534,13 @@ static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; queue_work(adapter->wq, &adapter->reset_task); + ret = iavf_wait_for_reset(adapter); + if (ret) + netdev_warn(netdev, "Changing private flags timeout or interrupted waiting for reset"); } } - return 0; + return ret; } /** @@ -627,6 +631,7 @@ static int iavf_set_ringparam(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); u32 new_rx_count, new_tx_count; + int ret = 0; if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) return -EINVAL; @@ -673,9 +678,12 @@ static int iavf_set_ringparam(struct net_device *netdev, if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; queue_work(adapter->wq, &adapter->reset_task); + ret = iavf_wait_for_reset(adapter); + if (ret) + netdev_warn(netdev, "Changing ring parameters timeout or interrupted waiting for reset"); } - return 0; + return ret; } /** @@ -1830,7 +1838,7 @@ static int iavf_set_channels(struct net_device *netdev, { struct iavf_adapter *adapter = netdev_priv(netdev); u32 num_req = ch->combined_count; - int i; + int ret = 0; if ((adapter->vf_res->vf_cap_flags & VIRTCHNL_VF_OFFLOAD_ADQ) && adapter->num_tc) { @@ -1854,20 +1862,11 @@ static int iavf_set_channels(struct net_device *netdev, adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED; iavf_schedule_reset(adapter); - /* wait for the reset is done */ - for (i = 0; i < IAVF_RESET_WAIT_COMPLETE_COUNT; i++) { - msleep(IAVF_RESET_WAIT_MS); - if (adapter->flags & IAVF_FLAG_RESET_PENDING) - continue; - break; - } - if (i == IAVF_RESET_WAIT_COMPLETE_COUNT) { - adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; - adapter->num_req_queues = 0; - return -EOPNOTSUPP; - } + ret = iavf_wait_for_reset(adapter); + if (ret) + netdev_warn(netdev, "Changing channel count timeout or interrupted waiting for reset"); - return 0; + return ret; } /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 058e19c6f94a..b89933aa5bfe 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -166,6 +166,45 @@ static struct iavf_adapter *iavf_pdev_to_adapter(struct pci_dev *pdev) return netdev_priv(pci_get_drvdata(pdev)); } +/** + * iavf_is_reset_in_progress - Check if a reset is in progress + * @adapter: board private structure + */ +static bool iavf_is_reset_in_progress(struct iavf_adapter *adapter) +{ + if (adapter->state == __IAVF_RESETTING || + adapter->flags & (IAVF_FLAG_RESET_PENDING | + IAVF_FLAG_RESET_NEEDED)) + return true; + + return false; +} + +/** + * iavf_wait_for_reset - Wait for reset to finish. + * @adapter: board private structure + * + * Returns 0 if reset finished successfully, negative on timeout or interrupt. + */ +int iavf_wait_for_reset(struct iavf_adapter *adapter) +{ + int ret = wait_event_interruptible_timeout(adapter->reset_waitqueue, + !iavf_is_reset_in_progress(adapter), + msecs_to_jiffies(5000)); + + /* If ret < 0 then it means wait was interrupted. + * If ret == 0 then it means we got a timeout while waiting + * for reset to finish. + * If ret > 0 it means reset has finished. + */ + if (ret > 0) + return 0; + else if (ret < 0) + return -EINTR; + else + return -EBUSY; +} + /** * iavf_allocate_dma_mem_d - OS specific memory alloc for shared code * @hw: pointer to the HW structure @@ -3149,6 +3188,7 @@ continue_reset: adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; + wake_up(&adapter->reset_waitqueue); mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); @@ -4313,6 +4353,7 @@ static int iavf_close(struct net_device *netdev) static int iavf_change_mtu(struct net_device *netdev, int new_mtu) { struct iavf_adapter *adapter = netdev_priv(netdev); + int ret = 0; netdev_dbg(netdev, "changing MTU from %d to %d\n", netdev->mtu, new_mtu); @@ -4325,9 +4366,14 @@ static int iavf_change_mtu(struct net_device *netdev, int new_mtu) if (netif_running(netdev)) { adapter->flags |= IAVF_FLAG_RESET_NEEDED; queue_work(adapter->wq, &adapter->reset_task); + ret = iavf_wait_for_reset(adapter); + if (ret < 0) + netdev_warn(netdev, "MTU change interrupted waiting for reset"); + else if (ret) + netdev_warn(netdev, "MTU change timed out waiting for reset"); } - return 0; + return ret; } #define NETIF_VLAN_OFFLOAD_FEATURES (NETIF_F_HW_VLAN_CTAG_RX | \ @@ -4928,6 +4974,9 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) /* Setup the wait queue for indicating transition to down status */ init_waitqueue_head(&adapter->down_waitqueue); + /* Setup the wait queue for indicating transition to running state */ + init_waitqueue_head(&adapter->reset_waitqueue); + /* Setup the wait queue for indicating virtchannel events */ init_waitqueue_head(&adapter->vc_waitqueue); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 7c0578b5457b..1bab896aaf40 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -2285,6 +2285,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, case VIRTCHNL_OP_ENABLE_QUEUES: /* enable transmits */ iavf_irq_enable(adapter, true); + wake_up(&adapter->reset_waitqueue); adapter->flags &= ~IAVF_FLAG_QUEUES_DISABLED; break; case VIRTCHNL_OP_DISABLE_QUEUES: -- cgit v1.2.3 From d2806d960e8387945b53edf7ed9d71ab3ab5f073 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Mon, 5 Jun 2023 10:52:23 -0400 Subject: Revert "iavf: Detach device during reset task" This reverts commit aa626da947e9cd30c4cf727493903e1adbb2c0a0. Detaching device during reset was not fully fixing the rtnl locking issue, as there could be a situation where callback was already in progress before detaching netdev. Furthermore, detaching netdevice causes TX timeouts if traffic is running. To reproduce: ip netns exec ns1 iperf3 -c $PEER_IP -t 600 --logfile /dev/null & while :; do for i in 200 7000 400 5000 300 3000 ; do ip netns exec ns1 ip link set $VF1 mtu $i sleep 2 done sleep 10 done Currently, callbacks such as iavf_change_mtu() wait for the reset. If the reset fails to acquire the rtnl_lock, they schedule the netdev update for later while continuing the reset flow. Operations like MTU changes are performed under the rtnl_lock. Therefore, when the operation finishes, another callback that uses rtnl_lock can start. Signed-off-by: Dawid Wesierski Signed-off-by: Marcin Szycik Signed-off-by: Mateusz Palczewski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index b89933aa5bfe..957ad6e63f6f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2977,11 +2977,6 @@ static void iavf_reset_task(struct work_struct *work) int i = 0, err; bool running; - /* Detach interface to avoid subsequent NDO callbacks */ - rtnl_lock(); - netif_device_detach(netdev); - rtnl_unlock(); - /* When device is being removed it doesn't make sense to run the reset * task, just return in such a case. */ @@ -2989,7 +2984,7 @@ static void iavf_reset_task(struct work_struct *work) if (adapter->state != __IAVF_REMOVE) queue_work(adapter->wq, &adapter->reset_task); - goto reset_finish; + return; } while (!mutex_trylock(&adapter->client_lock)) @@ -3192,7 +3187,7 @@ continue_reset: mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); - goto reset_finish; + return; reset_err: if (running) { set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); @@ -3213,10 +3208,6 @@ reset_err: } dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); -reset_finish: - rtnl_lock(); - netif_device_attach(netdev); - rtnl_unlock(); } /** -- cgit v1.2.3 From d916d273041b0b5652434ff27aec716ff90992ac Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Mon, 5 Jun 2023 10:52:24 -0400 Subject: Revert "iavf: Do not restart Tx queues after reset task failure" This reverts commit 08f1c147b7265245d67321585c68a27e990e0c4b. Netdev is no longer being detached during reset, so this fix can be reverted. We leave the removal of "hacky" IFF_UP flag update. Signed-off-by: Marcin Szycik Signed-off-by: Mateusz Palczewski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf_main.c | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 957ad6e63f6f..0c12a752429c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -3042,11 +3042,6 @@ static void iavf_reset_task(struct work_struct *work) iavf_disable_vf(adapter); mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); - if (netif_running(netdev)) { - rtnl_lock(); - dev_close(netdev); - rtnl_unlock(); - } return; /* Do not attempt to reinit. It's dead, Jim. */ } @@ -3197,16 +3192,6 @@ reset_err: mutex_unlock(&adapter->client_lock); mutex_unlock(&adapter->crit_lock); - - if (netif_running(netdev)) { - /* Close device to ensure that Tx queues will not be started - * during netif_device_attach() at the end of the reset task. - */ - rtnl_lock(); - dev_close(netdev); - rtnl_unlock(); - } - dev_err(&adapter->pdev->dev, "failed to allocate resources during reinit\n"); } -- cgit v1.2.3 From d1639a17319ba78a018280cd2df6577a7e5d9fab Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 5 Jun 2023 10:52:25 -0400 Subject: iavf: fix a deadlock caused by rtnl and driver's lock circular dependencies A driver's lock (crit_lock) is used to serialize all the driver's tasks. Lockdep, however, shows a circular dependency between rtnl and crit_lock. This happens when an ndo that already holds the rtnl requests the driver to reset, since the reset task (in some paths) tries to grab rtnl to either change real number of queues of update netdev features. [566.241851] ====================================================== [566.241893] WARNING: possible circular locking dependency detected [566.241936] 6.2.14-100.fc36.x86_64+debug #1 Tainted: G OE [566.241984] ------------------------------------------------------ [566.242025] repro.sh/2604 is trying to acquire lock: [566.242061] ffff9280fc5ceee8 (&adapter->crit_lock){+.+.}-{3:3}, at: iavf_close+0x3c/0x240 [iavf] [566.242167] but task is already holding lock: [566.242209] ffffffff9976d350 (rtnl_mutex){+.+.}-{3:3}, at: iavf_remove+0x6b5/0x730 [iavf] [566.242300] which lock already depends on the new lock. [566.242353] the existing dependency chain (in reverse order) is: [566.242401] -> #1 (rtnl_mutex){+.+.}-{3:3}: [566.242451] __mutex_lock+0xc1/0xbb0 [566.242489] iavf_init_interrupt_scheme+0x179/0x440 [iavf] [566.242560] iavf_watchdog_task+0x80b/0x1400 [iavf] [566.242627] process_one_work+0x2b3/0x560 [566.242663] worker_thread+0x4f/0x3a0 [566.242696] kthread+0xf2/0x120 [566.242730] ret_from_fork+0x29/0x50 [566.242763] -> #0 (&adapter->crit_lock){+.+.}-{3:3}: [566.242815] __lock_acquire+0x15ff/0x22b0 [566.242869] lock_acquire+0xd2/0x2c0 [566.242901] __mutex_lock+0xc1/0xbb0 [566.242934] iavf_close+0x3c/0x240 [iavf] [566.242997] __dev_close_many+0xac/0x120 [566.243036] dev_close_many+0x8b/0x140 [566.243071] unregister_netdevice_many_notify+0x165/0x7c0 [566.243116] unregister_netdevice_queue+0xd3/0x110 [566.243157] iavf_remove+0x6c1/0x730 [iavf] [566.243217] pci_device_remove+0x33/0xa0 [566.243257] device_release_driver_internal+0x1bc/0x240 [566.243299] pci_stop_bus_device+0x6c/0x90 [566.243338] pci_stop_and_remove_bus_device+0xe/0x20 [566.243380] pci_iov_remove_virtfn+0xd1/0x130 [566.243417] sriov_disable+0x34/0xe0 [566.243448] ice_free_vfs+0x2da/0x330 [ice] [566.244383] ice_sriov_configure+0x88/0xad0 [ice] [566.245353] sriov_numvfs_store+0xde/0x1d0 [566.246156] kernfs_fop_write_iter+0x15e/0x210 [566.246921] vfs_write+0x288/0x530 [566.247671] ksys_write+0x74/0xf0 [566.248408] do_syscall_64+0x58/0x80 [566.249145] entry_SYSCALL_64_after_hwframe+0x72/0xdc [566.249886] other info that might help us debug this: [566.252014] Possible unsafe locking scenario: [566.253432] CPU0 CPU1 [566.254118] ---- ---- [566.254800] lock(rtnl_mutex); [566.255514] lock(&adapter->crit_lock); [566.256233] lock(rtnl_mutex); [566.256897] lock(&adapter->crit_lock); [566.257388] *** DEADLOCK *** The deadlock can be triggered by a script that is continuously resetting the VF adapter while doing other operations requiring RTNL, e.g: while :; do ip link set $VF up ethtool --set-channels $VF combined 2 ip link set $VF down ip link set $VF up ethtool --set-channels $VF combined 4 ip link set $VF down done Any operation that triggers a reset can substitute "ethtool --set-channles" As a fix, add a new task "finish_config" that do all the work which needs rtnl lock. With the exception of iavf_remove(), all work that require rtnl should be called from this task. As for iavf_remove(), at the point where we need to call unregister_netdevice() (and grab rtnl_lock), we make sure the finish_config task is not running (cancel_work_sync()) to safely grab rtnl. Subsequent finish_config work cannot restart after that since the task is guarded by the __IAVF_IN_REMOVE_TASK bit in iavf_schedule_finish_config(). Fixes: 5ac49f3c2702 ("iavf: use mutexes for locking of critical sections") Signed-off-by: Ahmed Zaki Signed-off-by: Mateusz Palczewski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 2 + drivers/net/ethernet/intel/iavf/iavf_main.c | 114 +++++++++++++++++------- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 1 + 3 files changed, 85 insertions(+), 32 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index a5cab19eb6a8..bf5e3c8e97e0 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -255,6 +255,7 @@ struct iavf_adapter { struct workqueue_struct *wq; struct work_struct reset_task; struct work_struct adminq_task; + struct work_struct finish_config; struct delayed_work client_task; wait_queue_head_t down_waitqueue; wait_queue_head_t reset_waitqueue; @@ -521,6 +522,7 @@ int iavf_process_config(struct iavf_adapter *adapter); int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter); void iavf_schedule_reset(struct iavf_adapter *adapter); void iavf_schedule_request_stats(struct iavf_adapter *adapter); +void iavf_schedule_finish_config(struct iavf_adapter *adapter); void iavf_reset(struct iavf_adapter *adapter); void iavf_set_ethtool_ops(struct net_device *netdev); void iavf_update_stats(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 0c12a752429c..2a2ae3c4337b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1690,10 +1690,10 @@ static int iavf_set_interrupt_capability(struct iavf_adapter *adapter) adapter->msix_entries[vector].entry = vector; err = iavf_acquire_msix_vectors(adapter, v_budget); + if (!err) + iavf_schedule_finish_config(adapter); out: - netif_set_real_num_rx_queues(adapter->netdev, pairs); - netif_set_real_num_tx_queues(adapter->netdev, pairs); return err; } @@ -1913,9 +1913,7 @@ static int iavf_init_interrupt_scheme(struct iavf_adapter *adapter) goto err_alloc_queues; } - rtnl_lock(); err = iavf_set_interrupt_capability(adapter); - rtnl_unlock(); if (err) { dev_err(&adapter->pdev->dev, "Unable to setup interrupt capabilities\n"); @@ -2001,6 +1999,78 @@ err: return err; } +/** + * iavf_finish_config - do all netdev work that needs RTNL + * @work: our work_struct + * + * Do work that needs both RTNL and crit_lock. + **/ +static void iavf_finish_config(struct work_struct *work) +{ + struct iavf_adapter *adapter; + int pairs, err; + + adapter = container_of(work, struct iavf_adapter, finish_config); + + /* Always take RTNL first to prevent circular lock dependency */ + rtnl_lock(); + mutex_lock(&adapter->crit_lock); + + if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES) && + adapter->netdev_registered && + !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) { + netdev_update_features(adapter->netdev); + adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; + } + + switch (adapter->state) { + case __IAVF_DOWN: + if (!adapter->netdev_registered) { + err = register_netdevice(adapter->netdev); + if (err) { + dev_err(&adapter->pdev->dev, "Unable to register netdev (%d)\n", + err); + + /* go back and try again.*/ + iavf_free_rss(adapter); + iavf_free_misc_irq(adapter); + iavf_reset_interrupt_capability(adapter); + iavf_change_state(adapter, + __IAVF_INIT_CONFIG_ADAPTER); + goto out; + } + adapter->netdev_registered = true; + } + + /* Set the real number of queues when reset occurs while + * state == __IAVF_DOWN + */ + fallthrough; + case __IAVF_RUNNING: + pairs = adapter->num_active_queues; + netif_set_real_num_rx_queues(adapter->netdev, pairs); + netif_set_real_num_tx_queues(adapter->netdev, pairs); + break; + + default: + break; + } + +out: + mutex_unlock(&adapter->crit_lock); + rtnl_unlock(); +} + +/** + * iavf_schedule_finish_config - Set the flags and schedule a reset event + * @adapter: board private structure + **/ +void iavf_schedule_finish_config(struct iavf_adapter *adapter) +{ + if (!test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section)) + queue_work(adapter->wq, &adapter->finish_config); +} + /** * iavf_process_aq_command - process aq_required flags * and sends aq command @@ -2638,22 +2708,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) netif_carrier_off(netdev); adapter->link_up = false; - - /* set the semaphore to prevent any callbacks after device registration - * up to time when state of driver will be set to __IAVF_DOWN - */ - rtnl_lock(); - if (!adapter->netdev_registered) { - err = register_netdevice(netdev); - if (err) { - rtnl_unlock(); - goto err_register; - } - } - - adapter->netdev_registered = true; - netif_tx_stop_all_queues(netdev); + if (CLIENT_ALLOWED(adapter)) { err = iavf_lan_add_device(adapter); if (err) @@ -2666,7 +2722,6 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) iavf_change_state(adapter, __IAVF_DOWN); set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); - rtnl_unlock(); iavf_misc_irq_enable(adapter); wake_up(&adapter->down_waitqueue); @@ -2686,10 +2741,11 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) /* request initial VLAN offload settings */ iavf_set_vlan_offload_features(adapter, 0, netdev->features); + iavf_schedule_finish_config(adapter); return; + err_mem: iavf_free_rss(adapter); -err_register: iavf_free_misc_irq(adapter); err_sw_init: iavf_reset_interrupt_capability(adapter); @@ -2716,15 +2772,6 @@ static void iavf_watchdog_task(struct work_struct *work) goto restart_watchdog; } - if ((adapter->flags & IAVF_FLAG_SETUP_NETDEV_FEATURES) && - adapter->netdev_registered && - !test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section) && - rtnl_trylock()) { - netdev_update_features(adapter->netdev); - rtnl_unlock(); - adapter->flags &= ~IAVF_FLAG_SETUP_NETDEV_FEATURES; - } - if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); @@ -4942,6 +4989,7 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) INIT_WORK(&adapter->reset_task, iavf_reset_task); INIT_WORK(&adapter->adminq_task, iavf_adminq_task); + INIT_WORK(&adapter->finish_config, iavf_finish_config); INIT_DELAYED_WORK(&adapter->watchdog_task, iavf_watchdog_task); INIT_DELAYED_WORK(&adapter->client_task, iavf_client_task); queue_delayed_work(adapter->wq, &adapter->watchdog_task, @@ -5084,13 +5132,15 @@ static void iavf_remove(struct pci_dev *pdev) usleep_range(500, 1000); } cancel_delayed_work_sync(&adapter->watchdog_task); + cancel_work_sync(&adapter->finish_config); + rtnl_lock(); if (adapter->netdev_registered) { - rtnl_lock(); unregister_netdevice(netdev); adapter->netdev_registered = false; - rtnl_unlock(); } + rtnl_unlock(); + if (CLIENT_ALLOWED(adapter)) { err = iavf_lan_del_device(adapter); if (err) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 1bab896aaf40..073ac29ed84c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -2237,6 +2237,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, iavf_process_config(adapter); adapter->flags |= IAVF_FLAG_SETUP_NETDEV_FEATURES; + iavf_schedule_finish_config(adapter); iavf_set_queue_vlan_tag_loc(adapter); -- cgit v1.2.3 From c34743daca0eb1dc855831a5210f0800a850088e Mon Sep 17 00:00:00 2001 From: Ahmed Zaki Date: Mon, 5 Jun 2023 10:52:26 -0400 Subject: iavf: fix reset task race with iavf_remove() The reset task is currently scheduled from the watchdog or adminq tasks. First, all direct calls to schedule the reset task are replaced with the iavf_schedule_reset(), which is modified to accept the flag showing the type of reset. To prevent the reset task from starting once iavf_remove() starts, we need to check the __IAVF_IN_REMOVE_TASK bit before we schedule it. This is now easily added to iavf_schedule_reset(). Finally, remove the check for IAVF_FLAG_RESET_NEEDED in the watchdog task. It is redundant since all callers who set the flag immediately schedules the reset task. Fixes: 3ccd54ef44eb ("iavf: Fix init state closure on remove") Fixes: 14756b2ae265 ("iavf: Fix __IAVF_RESETTING state usage") Signed-off-by: Ahmed Zaki Signed-off-by: Mateusz Palczewski Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/iavf/iavf.h | 2 +- drivers/net/ethernet/intel/iavf/iavf_ethtool.c | 8 +++---- drivers/net/ethernet/intel/iavf/iavf_main.c | 32 +++++++++---------------- drivers/net/ethernet/intel/iavf/iavf_virtchnl.c | 3 +-- 4 files changed, 16 insertions(+), 29 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index bf5e3c8e97e0..8cbdebc5b698 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -520,7 +520,7 @@ int iavf_up(struct iavf_adapter *adapter); void iavf_down(struct iavf_adapter *adapter); int iavf_process_config(struct iavf_adapter *adapter); int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter); -void iavf_schedule_reset(struct iavf_adapter *adapter); +void iavf_schedule_reset(struct iavf_adapter *adapter, u64 flags); void iavf_schedule_request_stats(struct iavf_adapter *adapter); void iavf_schedule_finish_config(struct iavf_adapter *adapter); void iavf_reset(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index b7141c2a941d..2f47cfa7f06e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -532,8 +532,7 @@ static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) /* issue a reset to force legacy-rx change to take effect */ if (changed_flags & IAVF_FLAG_LEGACY_RX) { if (netif_running(netdev)) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(adapter->wq, &adapter->reset_task); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); ret = iavf_wait_for_reset(adapter); if (ret) netdev_warn(netdev, "Changing private flags timeout or interrupted waiting for reset"); @@ -676,8 +675,7 @@ static int iavf_set_ringparam(struct net_device *netdev, } if (netif_running(netdev)) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(adapter->wq, &adapter->reset_task); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); ret = iavf_wait_for_reset(adapter); if (ret) netdev_warn(netdev, "Changing ring parameters timeout or interrupted waiting for reset"); @@ -1860,7 +1858,7 @@ static int iavf_set_channels(struct net_device *netdev, adapter->num_req_queues = num_req; adapter->flags |= IAVF_FLAG_REINIT_ITR_NEEDED; - iavf_schedule_reset(adapter); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); ret = iavf_wait_for_reset(adapter); if (ret) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2a2ae3c4337b..3a88d413ddee 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -301,12 +301,14 @@ static int iavf_lock_timeout(struct mutex *lock, unsigned int msecs) /** * iavf_schedule_reset - Set the flags and schedule a reset event * @adapter: board private structure + * @flags: IAVF_FLAG_RESET_PENDING or IAVF_FLAG_RESET_NEEDED **/ -void iavf_schedule_reset(struct iavf_adapter *adapter) +void iavf_schedule_reset(struct iavf_adapter *adapter, u64 flags) { - if (!(adapter->flags & - (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED))) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; + if (!test_bit(__IAVF_IN_REMOVE_TASK, &adapter->crit_section) && + !(adapter->flags & + (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED))) { + adapter->flags |= flags; queue_work(adapter->wq, &adapter->reset_task); } } @@ -334,7 +336,7 @@ static void iavf_tx_timeout(struct net_device *netdev, unsigned int txqueue) struct iavf_adapter *adapter = netdev_priv(netdev); adapter->tx_timeout_count++; - iavf_schedule_reset(adapter); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); } /** @@ -2478,7 +2480,7 @@ int iavf_parse_vf_resource_msg(struct iavf_adapter *adapter) adapter->vsi_res->num_queue_pairs); adapter->flags |= IAVF_FLAG_REINIT_MSIX_NEEDED; adapter->num_req_queues = adapter->vsi_res->num_queue_pairs; - iavf_schedule_reset(adapter); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); return -EAGAIN; } @@ -2775,14 +2777,6 @@ static void iavf_watchdog_task(struct work_struct *work) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) iavf_change_state(adapter, __IAVF_COMM_FAILED); - if (adapter->flags & IAVF_FLAG_RESET_NEEDED) { - adapter->aq_required = 0; - adapter->current_op = VIRTCHNL_OP_UNKNOWN; - mutex_unlock(&adapter->crit_lock); - queue_work(adapter->wq, &adapter->reset_task); - return; - } - switch (adapter->state) { case __IAVF_STARTUP: iavf_startup(adapter); @@ -2910,11 +2904,10 @@ static void iavf_watchdog_task(struct work_struct *work) /* check for hw reset */ reg_val = rd32(hw, IAVF_VF_ARQLEN1) & IAVF_VF_ARQLEN1_ARQENABLE_MASK; if (!reg_val) { - adapter->flags |= IAVF_FLAG_RESET_PENDING; adapter->aq_required = 0; adapter->current_op = VIRTCHNL_OP_UNKNOWN; dev_err(&adapter->pdev->dev, "Hardware reset detected\n"); - queue_work(adapter->wq, &adapter->reset_task); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING); mutex_unlock(&adapter->crit_lock); queue_delayed_work(adapter->wq, &adapter->watchdog_task, HZ * 2); @@ -3288,9 +3281,7 @@ static void iavf_adminq_task(struct work_struct *work) } while (pending); mutex_unlock(&adapter->crit_lock); - if ((adapter->flags & - (IAVF_FLAG_RESET_PENDING | IAVF_FLAG_RESET_NEEDED)) || - adapter->state == __IAVF_RESETTING) + if (iavf_is_reset_in_progress(adapter)) goto freedom; /* check for error indications */ @@ -4387,8 +4378,7 @@ static int iavf_change_mtu(struct net_device *netdev, int new_mtu) } if (netif_running(netdev)) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(adapter->wq, &adapter->reset_task); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_NEEDED); ret = iavf_wait_for_reset(adapter); if (ret < 0) netdev_warn(netdev, "MTU change interrupted waiting for reset"); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 073ac29ed84c..be3c007ce90a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1961,9 +1961,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, case VIRTCHNL_EVENT_RESET_IMPENDING: dev_info(&adapter->pdev->dev, "Reset indication received from the PF\n"); if (!(adapter->flags & IAVF_FLAG_RESET_PENDING)) { - adapter->flags |= IAVF_FLAG_RESET_PENDING; dev_info(&adapter->pdev->dev, "Scheduling reset task\n"); - queue_work(adapter->wq, &adapter->reset_task); + iavf_schedule_reset(adapter, IAVF_FLAG_RESET_PENDING); } break; default: -- cgit v1.2.3 From 9efa1a5407e81265ea502cab83be4de503decc49 Mon Sep 17 00:00:00 2001 From: Fedor Ross Date: Thu, 4 May 2023 21:50:59 +0200 Subject: can: mcp251xfd: __mcp251xfd_chip_set_mode(): increase poll timeout The mcp251xfd controller needs an idle bus to enter 'Normal CAN 2.0 mode' or . The maximum length of a CAN frame is 736 bits (64 data bytes, CAN-FD, EFF mode, worst case bit stuffing and interframe spacing). For low bit rates like 10 kbit/s the arbitrarily chosen MCP251XFD_POLL_TIMEOUT_US of 1 ms is too small. Otherwise during polling for the CAN controller to enter 'Normal CAN 2.0 mode' the timeout limit is exceeded and the configuration fails with: | $ ip link set dev can1 up type can bitrate 10000 | [ 731.911072] mcp251xfd spi2.1 can1: Controller failed to enter mode CAN 2.0 Mode (6) and stays in Configuration Mode (4) (con=0x068b0760, osc=0x00000468). | [ 731.927192] mcp251xfd spi2.1 can1: CRC read error at address 0x0e0c (length=4, data=00 00 00 00, CRC=0x0000) retrying. | [ 731.938101] A link change request failed with some changes committed already. Interface can1 may have been left with an inconsistent configuration, please check. | RTNETLINK answers: Connection timed out Make MCP251XFD_POLL_TIMEOUT_US timeout calculation dynamic. Use maximum of 1ms and bit time of 1 full 64 data bytes CAN-FD frame in EFF mode, worst case bit stuffing and interframe spacing at the current bit rate. For easier backporting define the macro MCP251XFD_FRAME_LEN_MAX_BITS that holds the max frame length in bits, which is 736. This can be replaced by can_frame_bits(true, true, true, true, CANFD_MAX_DLEN) in a cleanup patch later. Fixes: 55e5b97f003e8 ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Signed-off-by: Fedor Ross Signed-off-by: Marek Vasut Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20230717-mcp251xfd-fix-increase-poll-timeout-v5-1-06600f34c684@pengutronix.de Signed-off-by: Marc Kleine-Budde --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 10 ++++++++-- drivers/net/can/spi/mcp251xfd/mcp251xfd.h | 1 + 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 68df6d4641b5..eebf967f4711 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -227,6 +227,8 @@ static int __mcp251xfd_chip_set_mode(const struct mcp251xfd_priv *priv, const u8 mode_req, bool nowait) { + const struct can_bittiming *bt = &priv->can.bittiming; + unsigned long timeout_us = MCP251XFD_POLL_TIMEOUT_US; u32 con = 0, con_reqop, osc = 0; u8 mode; int err; @@ -246,12 +248,16 @@ __mcp251xfd_chip_set_mode(const struct mcp251xfd_priv *priv, if (mode_req == MCP251XFD_REG_CON_MODE_SLEEP || nowait) return 0; + if (bt->bitrate) + timeout_us = max_t(unsigned long, timeout_us, + MCP251XFD_FRAME_LEN_MAX_BITS * USEC_PER_SEC / + bt->bitrate); + err = regmap_read_poll_timeout(priv->map_reg, MCP251XFD_REG_CON, con, !mcp251xfd_reg_invalid(con) && FIELD_GET(MCP251XFD_REG_CON_OPMOD_MASK, con) == mode_req, - MCP251XFD_POLL_SLEEP_US, - MCP251XFD_POLL_TIMEOUT_US); + MCP251XFD_POLL_SLEEP_US, timeout_us); if (err != -ETIMEDOUT && err != -EBADMSG) return err; diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd.h b/drivers/net/can/spi/mcp251xfd/mcp251xfd.h index 7024ff0cc2c0..24510b3b8020 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd.h +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd.h @@ -387,6 +387,7 @@ static_assert(MCP251XFD_TIMESTAMP_WORK_DELAY_SEC < #define MCP251XFD_OSC_STAB_TIMEOUT_US (10 * MCP251XFD_OSC_STAB_SLEEP_US) #define MCP251XFD_POLL_SLEEP_US (10) #define MCP251XFD_POLL_TIMEOUT_US (USEC_PER_MSEC) +#define MCP251XFD_FRAME_LEN_MAX_BITS (736) /* Misc */ #define MCP251XFD_NAPI_WEIGHT 32 -- cgit v1.2.3 From f4032d615f90970d6c3ac1d9c0bce3351eb4445c Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Tue, 16 May 2023 01:25:54 +0300 Subject: tpm: tpm_vtpm_proxy: fix a race condition in /dev/vtpmx creation /dev/vtpmx is made visible before 'workqueue' is initialized, which can lead to a memory corruption in the worst case scenario. Address this by initializing 'workqueue' as the very first step of the driver initialization. Cc: stable@vger.kernel.org Fixes: 6f99612e2500 ("tpm: Proxy driver for supporting multiple emulated TPMs") Reviewed-by: Stefan Berger Signed-off-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_vtpm_proxy.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/drivers/char/tpm/tpm_vtpm_proxy.c b/drivers/char/tpm/tpm_vtpm_proxy.c index 5c865987ba5c..30e953988cab 100644 --- a/drivers/char/tpm/tpm_vtpm_proxy.c +++ b/drivers/char/tpm/tpm_vtpm_proxy.c @@ -683,37 +683,21 @@ static struct miscdevice vtpmx_miscdev = { .fops = &vtpmx_fops, }; -static int vtpmx_init(void) -{ - return misc_register(&vtpmx_miscdev); -} - -static void vtpmx_cleanup(void) -{ - misc_deregister(&vtpmx_miscdev); -} - static int __init vtpm_module_init(void) { int rc; - rc = vtpmx_init(); - if (rc) { - pr_err("couldn't create vtpmx device\n"); - return rc; - } - workqueue = create_workqueue("tpm-vtpm"); if (!workqueue) { pr_err("couldn't create workqueue\n"); - rc = -ENOMEM; - goto err_vtpmx_cleanup; + return -ENOMEM; } - return 0; - -err_vtpmx_cleanup: - vtpmx_cleanup(); + rc = misc_register(&vtpmx_miscdev); + if (rc) { + pr_err("couldn't create vtpmx device\n"); + destroy_workqueue(workqueue); + } return rc; } @@ -721,7 +705,7 @@ err_vtpmx_cleanup: static void __exit vtpm_module_exit(void) { destroy_workqueue(workqueue); - vtpmx_cleanup(); + misc_deregister(&vtpmx_miscdev); } module_init(vtpm_module_init); -- cgit v1.2.3 From edb13d7bb034c4d5523f15e9aeea31c504af6f91 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Wed, 17 May 2023 15:29:31 +0300 Subject: tpm: tpm_tis: Disable interrupts *only* for AEON UPX-i11 Further restrict with DMI_PRODUCT_VERSION. Cc: stable@vger.kernel.org # v6.4+ Link: https://lore.kernel.org/linux-integrity/20230517122931.22385-1-peter.ujfalusi@linux.intel.com/ Fixes: 95a9359ee22f ("tpm: tpm_tis: Disable interrupts for AEON UPX-i11") Signed-off-by: Peter Ujfalusi Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 7db3593941ea..9cb4e81fc548 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -143,6 +143,7 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = { .ident = "UPX-TGL", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "AAEON"), + DMI_MATCH(DMI_PRODUCT_VERSION, "UPX-TGL"), }, }, {} -- cgit v1.2.3 From f0afba4042bd902d290405c0638bc1295872d8a7 Mon Sep 17 00:00:00 2001 From: Peijie Shao Date: Tue, 23 May 2023 10:45:36 +0800 Subject: tpm_tis_spi: Release chip select when flow control fails The failure paths in tpm_tis_spi_transfer() do not deactivate chip select. Send an empty message (cs_select == 0) to overcome this. The patch is tested by two ways. One way needs to touch hardware: 1. force pull MISO pin down to GND, it emulates a forever 'WAIT' timing. 2. probe cs pin by an oscilloscope. 3. load tpm_tis_spi.ko. After loading, dmesg prints: "probe of spi0.0 failed with error -110" and oscilloscope shows cs pin goes high(deactivated) after the failure. Before the patch, cs pin keeps low. Second way is by writing a fake spi controller. 1. implement .transfer_one method, fill all rx buf with 0. 2. implement .set_cs method, print the state of cs pin. we can see cs goes high after the failure. Signed-off-by: Peijie Shao Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_spi_main.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis_spi_main.c b/drivers/char/tpm/tpm_tis_spi_main.c index 1f5207974a17..9bfaba092a06 100644 --- a/drivers/char/tpm/tpm_tis_spi_main.c +++ b/drivers/char/tpm/tpm_tis_spi_main.c @@ -136,6 +136,14 @@ int tpm_tis_spi_transfer(struct tpm_tis_data *data, u32 addr, u16 len, } exit: + if (ret < 0) { + /* Deactivate chip select */ + memset(&spi_xfer, 0, sizeof(spi_xfer)); + spi_message_init(&m); + spi_message_add_tail(&spi_xfer, &m); + spi_sync_locked(phy->spi_device, &m); + } + spi_bus_unlock(phy->spi_device->master); return ret; } -- cgit v1.2.3 From f3b70b6e3390bfdf18fdd7d278a72a12784fdcce Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Wed, 24 May 2023 17:40:39 +0200 Subject: tpm: tis_i2c: Limit read bursts to I2C_SMBUS_BLOCK_MAX (32) bytes Underlying I2C bus drivers not always support longer transfers and imx-lpi2c for instance doesn't. SLB 9673 offers 427-bytes packets. Visible symptoms are: tpm tpm0: Error left over data tpm tpm0: tpm_transmit: tpm_recv: error -5 tpm_tis_i2c: probe of 1-002e failed with error -5 Cc: stable@vger.kernel.org # v5.20+ Fixes: bbc23a07b072 ("tpm: Add tpm_tis_i2c backend for tpm_tis_core") Tested-by: Michael Haener Signed-off-by: Alexander Sverdlin Reviewed-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_i2c.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index c8c34adc14c0..106fd20d94e4 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -189,21 +189,28 @@ static int tpm_tis_i2c_read_bytes(struct tpm_tis_data *data, u32 addr, u16 len, int ret; for (i = 0; i < TPM_RETRY; i++) { - /* write register */ - msg.len = sizeof(reg); - msg.buf = ® - msg.flags = 0; - ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); - if (ret < 0) - return ret; - - /* read data */ - msg.buf = result; - msg.len = len; - msg.flags = I2C_M_RD; - ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); - if (ret < 0) - return ret; + u16 read = 0; + + while (read < len) { + /* write register */ + msg.len = sizeof(reg); + msg.buf = ® + msg.flags = 0; + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + + /* read data */ + msg.buf = result + read; + msg.len = len - read; + msg.flags = I2C_M_RD; + if (msg.len > I2C_SMBUS_BLOCK_MAX) + msg.len = I2C_SMBUS_BLOCK_MAX; + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + read += msg.len; + } ret = tpm_tis_i2c_sanity_check_read(reg, len, result); if (ret == 0) -- cgit v1.2.3 From 83e7e5d89f04d1c417492940f7922bc8416a8cc4 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Wed, 24 May 2023 17:40:40 +0200 Subject: tpm: tis_i2c: Limit write bursts to I2C_SMBUS_BLOCK_MAX (32) bytes Underlying I2C bus drivers not always support longer transfers and imx-lpi2c for instance doesn't. The fix is symmetric to previous patch which fixed the read direction. Cc: stable@vger.kernel.org # v5.20+ Fixes: bbc23a07b072 ("tpm: Add tpm_tis_i2c backend for tpm_tis_core") Tested-by: Michael Haener Signed-off-by: Alexander Sverdlin Reviewed-by: Jarkko Sakkinen Reviewed-by: Jerry Snitselaar Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_i2c.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_i2c.c b/drivers/char/tpm/tpm_tis_i2c.c index 106fd20d94e4..82fda488e98b 100644 --- a/drivers/char/tpm/tpm_tis_i2c.c +++ b/drivers/char/tpm/tpm_tis_i2c.c @@ -230,19 +230,27 @@ static int tpm_tis_i2c_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, struct i2c_msg msg = { .addr = phy->i2c_client->addr }; u8 reg = tpm_tis_i2c_address_to_register(addr); int ret; + u16 wrote = 0; if (len > TPM_BUFSIZE - 1) return -EIO; - /* write register and data in one go */ phy->io_buf[0] = reg; - memcpy(phy->io_buf + sizeof(reg), value, len); - - msg.len = sizeof(reg) + len; msg.buf = phy->io_buf; - ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); - if (ret < 0) - return ret; + while (wrote < len) { + /* write register and data in one go */ + msg.len = sizeof(reg) + len - wrote; + if (msg.len > I2C_SMBUS_BLOCK_MAX) + msg.len = I2C_SMBUS_BLOCK_MAX; + + memcpy(phy->io_buf + sizeof(reg), value + wrote, + msg.len - sizeof(reg)); + + ret = tpm_tis_i2c_retry_transfer_until_ack(data, &msg); + if (ret < 0) + return ret; + wrote += msg.len - sizeof(reg); + } return 0; } -- cgit v1.2.3 From d55901522f96082a43b9842d34867363c0cdbac5 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Thu, 23 Mar 2023 14:04:12 +0100 Subject: keys: Fix linking a duplicate key to a keyring's assoc_array When making a DNS query inside the kernel using dns_query(), the request code can in rare cases end up creating a duplicate index key in the assoc_array of the destination keyring. It is eventually found by a BUG_ON() check in the assoc_array implementation and results in a crash. Example report: [2158499.700025] kernel BUG at ../lib/assoc_array.c:652! [2158499.700039] invalid opcode: 0000 [#1] SMP PTI [2158499.700065] CPU: 3 PID: 31985 Comm: kworker/3:1 Kdump: loaded Not tainted 5.3.18-150300.59.90-default #1 SLE15-SP3 [2158499.700096] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 11/12/2020 [2158499.700351] Workqueue: cifsiod cifs_resolve_server [cifs] [2158499.700380] RIP: 0010:assoc_array_insert+0x85f/0xa40 [2158499.700401] Code: ff 74 2b 48 8b 3b 49 8b 45 18 4c 89 e6 48 83 e7 fe e8 95 ec 74 00 3b 45 88 7d db 85 c0 79 d4 0f 0b 0f 0b 0f 0b e8 41 f2 be ff <0f> 0b 0f 0b 81 7d 88 ff ff ff 7f 4c 89 eb 4c 8b ad 58 ff ff ff 0f [2158499.700448] RSP: 0018:ffffc0bd6187faf0 EFLAGS: 00010282 [2158499.700470] RAX: ffff9f1ea7da2fe8 RBX: ffff9f1ea7da2fc1 RCX: 0000000000000005 [2158499.700492] RDX: 0000000000000000 RSI: 0000000000000005 RDI: 0000000000000000 [2158499.700515] RBP: ffffc0bd6187fbb0 R08: ffff9f185faf1100 R09: 0000000000000000 [2158499.700538] R10: ffff9f1ea7da2cc0 R11: 000000005ed8cec8 R12: ffffc0bd6187fc28 [2158499.700561] R13: ffff9f15feb8d000 R14: ffff9f1ea7da2fc0 R15: ffff9f168dc0d740 [2158499.700585] FS: 0000000000000000(0000) GS:ffff9f185fac0000(0000) knlGS:0000000000000000 [2158499.700610] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [2158499.700630] CR2: 00007fdd94fca238 CR3: 0000000809d8c006 CR4: 00000000003706e0 [2158499.700702] Call Trace: [2158499.700741] ? key_alloc+0x447/0x4b0 [2158499.700768] ? __key_link_begin+0x43/0xa0 [2158499.700790] __key_link_begin+0x43/0xa0 [2158499.700814] request_key_and_link+0x2c7/0x730 [2158499.700847] ? dns_resolver_read+0x20/0x20 [dns_resolver] [2158499.700873] ? key_default_cmp+0x20/0x20 [2158499.700898] request_key_tag+0x43/0xa0 [2158499.700926] dns_query+0x114/0x2ca [dns_resolver] [2158499.701127] dns_resolve_server_name_to_ip+0x194/0x310 [cifs] [2158499.701164] ? scnprintf+0x49/0x90 [2158499.701190] ? __switch_to_asm+0x40/0x70 [2158499.701211] ? __switch_to_asm+0x34/0x70 [2158499.701405] reconn_set_ipaddr_from_hostname+0x81/0x2a0 [cifs] [2158499.701603] cifs_resolve_server+0x4b/0xd0 [cifs] [2158499.701632] process_one_work+0x1f8/0x3e0 [2158499.701658] worker_thread+0x2d/0x3f0 [2158499.701682] ? process_one_work+0x3e0/0x3e0 [2158499.701703] kthread+0x10d/0x130 [2158499.701723] ? kthread_park+0xb0/0xb0 [2158499.701746] ret_from_fork+0x1f/0x40 The situation occurs as follows: * Some kernel facility invokes dns_query() to resolve a hostname, for example, "abcdef". The function registers its global DNS resolver cache as current->cred.thread_keyring and passes the query to request_key_net() -> request_key_tag() -> request_key_and_link(). * Function request_key_and_link() creates a keyring_search_context object. Its match_data.cmp method gets set via a call to type->match_preparse() (resolves to dns_resolver_match_preparse()) to dns_resolver_cmp(). * Function request_key_and_link() continues and invokes search_process_keyrings_rcu() which returns that a given key was not found. The control is then passed to request_key_and_link() -> construct_alloc_key(). * Concurrently to that, a second task similarly makes a DNS query for "abcdef." and its result gets inserted into the DNS resolver cache. * Back on the first task, function construct_alloc_key() first runs __key_link_begin() to determine an assoc_array_edit operation to insert a new key. Index keys in the array are compared exactly as-is, using keyring_compare_object(). The operation finds that "abcdef" is not yet present in the destination keyring. * Function construct_alloc_key() continues and checks if a given key is already present on some keyring by again calling search_process_keyrings_rcu(). This search is done using dns_resolver_cmp() and "abcdef" gets matched with now present key "abcdef.". * The found key is linked on the destination keyring by calling __key_link() and using the previously calculated assoc_array_edit operation. This inserts the "abcdef." key in the array but creates a duplicity because the same index key is already present. Fix the problem by postponing __key_link_begin() in construct_alloc_key() until an actual key which should be linked into the destination keyring is determined. [jarkko@kernel.org: added a fixes tag and cc to stable] Cc: stable@vger.kernel.org # v5.3+ Fixes: df593ee23e05 ("keys: Hoist locking out of __key_link_begin()") Signed-off-by: Petr Pavlu Reviewed-by: Joey Lee Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- security/keys/request_key.c | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/security/keys/request_key.c b/security/keys/request_key.c index 07a0ef2baacd..a7673ad86d18 100644 --- a/security/keys/request_key.c +++ b/security/keys/request_key.c @@ -401,17 +401,21 @@ static int construct_alloc_key(struct keyring_search_context *ctx, set_bit(KEY_FLAG_USER_CONSTRUCT, &key->flags); if (dest_keyring) { - ret = __key_link_lock(dest_keyring, &ctx->index_key); + ret = __key_link_lock(dest_keyring, &key->index_key); if (ret < 0) goto link_lock_failed; - ret = __key_link_begin(dest_keyring, &ctx->index_key, &edit); - if (ret < 0) - goto link_prealloc_failed; } - /* attach the key to the destination keyring under lock, but we do need + /* + * Attach the key to the destination keyring under lock, but we do need * to do another check just in case someone beat us to it whilst we - * waited for locks */ + * waited for locks. + * + * The caller might specify a comparison function which looks for keys + * that do not exactly match but are still equivalent from the caller's + * perspective. The __key_link_begin() operation must be done only after + * an actual key is determined. + */ mutex_lock(&key_construction_mutex); rcu_read_lock(); @@ -420,12 +424,16 @@ static int construct_alloc_key(struct keyring_search_context *ctx, if (!IS_ERR(key_ref)) goto key_already_present; - if (dest_keyring) + if (dest_keyring) { + ret = __key_link_begin(dest_keyring, &key->index_key, &edit); + if (ret < 0) + goto link_alloc_failed; __key_link(dest_keyring, key, &edit); + } mutex_unlock(&key_construction_mutex); if (dest_keyring) - __key_link_end(dest_keyring, &ctx->index_key, edit); + __key_link_end(dest_keyring, &key->index_key, edit); mutex_unlock(&user->cons_lock); *_key = key; kleave(" = 0 [%d]", key_serial(key)); @@ -438,10 +446,13 @@ key_already_present: mutex_unlock(&key_construction_mutex); key = key_ref_to_ptr(key_ref); if (dest_keyring) { + ret = __key_link_begin(dest_keyring, &key->index_key, &edit); + if (ret < 0) + goto link_alloc_failed_unlocked; ret = __key_link_check_live_key(dest_keyring, key); if (ret == 0) __key_link(dest_keyring, key, &edit); - __key_link_end(dest_keyring, &ctx->index_key, edit); + __key_link_end(dest_keyring, &key->index_key, edit); if (ret < 0) goto link_check_failed; } @@ -456,8 +467,10 @@ link_check_failed: kleave(" = %d [linkcheck]", ret); return ret; -link_prealloc_failed: - __key_link_end(dest_keyring, &ctx->index_key, edit); +link_alloc_failed: + mutex_unlock(&key_construction_mutex); +link_alloc_failed_unlocked: + __key_link_end(dest_keyring, &key->index_key, edit); link_lock_failed: mutex_unlock(&user->cons_lock); key_put(key); -- cgit v1.2.3 From ecff6813d2bcf0c670881a9ba3f51cb032dd405a Mon Sep 17 00:00:00 2001 From: Jerry Snitselaar Date: Thu, 29 Jun 2023 13:41:47 -0700 Subject: tpm: return false from tpm_amd_is_rng_defective on non-x86 platforms tpm_amd_is_rng_defective is for dealing with an issue related to the AMD firmware TPM, so on non-x86 architectures just have it inline and return false. Cc: stable@vger.kernel.org # v6.3+ Reported-by: Sachin Sant Reported-by: Aneesh Kumar K. V Closes: https://lore.kernel.org/lkml/99B81401-DB46-49B9-B321-CF832B50CAC3@linux.ibm.com/ Fixes: f1324bbc4011 ("tpm: disable hwrng for fTPM on some AMD designs") Signed-off-by: Jerry Snitselaar Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm-chip.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/char/tpm/tpm-chip.c b/drivers/char/tpm/tpm-chip.c index cd48033b804a..cf5499e51999 100644 --- a/drivers/char/tpm/tpm-chip.c +++ b/drivers/char/tpm/tpm-chip.c @@ -518,6 +518,7 @@ static int tpm_add_legacy_sysfs(struct tpm_chip *chip) * 6.x.y.z series: 6.0.18.6 + * 3.x.y.z series: 3.57.y.5 + */ +#ifdef CONFIG_X86 static bool tpm_amd_is_rng_defective(struct tpm_chip *chip) { u32 val1, val2; @@ -566,6 +567,12 @@ release: return true; } +#else +static inline bool tpm_amd_is_rng_defective(struct tpm_chip *chip) +{ + return false; +} +#endif /* CONFIG_X86 */ static int tpm_hwrng_read(struct hwrng *rng, void *data, size_t max, bool wait) { -- cgit v1.2.3 From 2a4152742025c5f21482e8cebc581702a0fa5b01 Mon Sep 17 00:00:00 2001 From: Jiapeng Chong Date: Wed, 14 Jun 2023 10:18:25 +0800 Subject: security: keys: Modify mismatched function name No functional modification involved. security/keys/trusted-keys/trusted_tpm2.c:203: warning: expecting prototype for tpm_buf_append_auth(). Prototype was for tpm2_buf_append_auth() instead. Fixes: 2e19e10131a0 ("KEYS: trusted: Move TPM2 trusted keys code") Reported-by: Abaci Robot Closes: https://bugzilla.openanolis.cn/show_bug.cgi?id=5524 Signed-off-by: Jiapeng Chong Reviewed-by: Paul Moore Signed-off-by: Jarkko Sakkinen --- security/keys/trusted-keys/trusted_tpm2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/keys/trusted-keys/trusted_tpm2.c b/security/keys/trusted-keys/trusted_tpm2.c index 2b2c8eb258d5..bc700f85f80b 100644 --- a/security/keys/trusted-keys/trusted_tpm2.c +++ b/security/keys/trusted-keys/trusted_tpm2.c @@ -186,7 +186,7 @@ int tpm2_key_priv(void *context, size_t hdrlen, } /** - * tpm_buf_append_auth() - append TPMS_AUTH_COMMAND to the buffer. + * tpm2_buf_append_auth() - append TPMS_AUTH_COMMAND to the buffer. * * @buf: an allocated tpm_buf instance * @session_handle: session handle -- cgit v1.2.3 From 08b0af4478bacb8bb701c172c99a34ea32da89f5 Mon Sep 17 00:00:00 2001 From: Christian Hesse Date: Mon, 10 Jul 2023 23:16:09 +0200 Subject: tpm/tpm_tis: Disable interrupts for Framework Laptop Intel 12th gen This device suffer an irq storm, so add it in tpm_tis_dmi_table to force polling. Cc: stable@vger.kernel.org # v6.4+ Link: https://community.frame.work/t/boot-and-shutdown-hangs-with-arch-linux-kernel-6-4-1-mainline-and-arch/33118 Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Reported-by: Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217631 Signed-off-by: Christian Hesse Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 9cb4e81fc548..5dd391ed3320 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -114,6 +114,14 @@ static int tpm_tis_disable_irq(const struct dmi_system_id *d) } static const struct dmi_system_id tpm_tis_dmi_table[] = { + { + .callback = tpm_tis_disable_irq, + .ident = "Framework Laptop (12th Gen Intel Core)", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Framework"), + DMI_MATCH(DMI_PRODUCT_NAME, "Laptop (12th Gen Intel Core)"), + }, + }, { .callback = tpm_tis_disable_irq, .ident = "ThinkPad T490s", -- cgit v1.2.3 From bc825e851c2fe89c127cac1e0e5cf344c4940619 Mon Sep 17 00:00:00 2001 From: Christian Hesse Date: Mon, 10 Jul 2023 23:16:10 +0200 Subject: tpm/tpm_tis: Disable interrupts for Framework Laptop Intel 13th gen This device suffer an irq storm, so add it in tpm_tis_dmi_table to force polling. Cc: stable@vger.kernel.org # v6.4+ Link: https://community.frame.work/t/boot-and-shutdown-hangs-with-arch-linux-kernel-6-4-1-mainline-and-arch/33118 Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Reported-by: Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217631 Signed-off-by: Christian Hesse Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 5dd391ed3320..4e4426965cd0 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -122,6 +122,14 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Laptop (12th Gen Intel Core)"), }, }, + { + .callback = tpm_tis_disable_irq, + .ident = "Framework Laptop (13th Gen Intel Core)", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Framework"), + DMI_MATCH(DMI_PRODUCT_NAME, "Laptop (13th Gen Intel Core)"), + }, + }, { .callback = tpm_tis_disable_irq, .ident = "ThinkPad T490s", -- cgit v1.2.3 From b1c1b98962d17a922989aa3b2822946bbb5c091f Mon Sep 17 00:00:00 2001 From: Valentin David Date: Mon, 10 Jul 2023 22:27:49 +0200 Subject: tpm: Do not remap from ACPI resources again for Pluton TPM For Pluton TPM devices, it was assumed that there was no ACPI memory regions. This is not true for ASUS ROG Ally. ACPI advertises 0xfd500000-0xfd5fffff. Since remapping is already done in `crb_map_pluton`, remapping again in `crb_map_io` causes EBUSY error: [ 3.510453] tpm_crb MSFT0101:00: can't request region for resource [mem 0xfd500000-0xfd5fffff] [ 3.510463] tpm_crb: probe of MSFT0101:00 failed with error -16 Cc: stable@vger.kernel.org # v6.3+ Fixes: 4d2732882703 ("tpm_crb: Add support for CRB devices based on Pluton") Signed-off-by: Valentin David Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_crb.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/char/tpm/tpm_crb.c b/drivers/char/tpm/tpm_crb.c index d43a0d7b97a8..1a5d09b18513 100644 --- a/drivers/char/tpm/tpm_crb.c +++ b/drivers/char/tpm/tpm_crb.c @@ -563,15 +563,18 @@ static int crb_map_io(struct acpi_device *device, struct crb_priv *priv, u32 rsp_size; int ret; - INIT_LIST_HEAD(&acpi_resource_list); - ret = acpi_dev_get_resources(device, &acpi_resource_list, - crb_check_resource, iores_array); - if (ret < 0) - return ret; - acpi_dev_free_resource_list(&acpi_resource_list); - - /* Pluton doesn't appear to define ACPI memory regions */ + /* + * Pluton sometimes does not define ACPI memory regions. + * Mapping is then done in crb_map_pluton + */ if (priv->sm != ACPI_TPM2_COMMAND_BUFFER_WITH_PLUTON) { + INIT_LIST_HEAD(&acpi_resource_list); + ret = acpi_dev_get_resources(device, &acpi_resource_list, + crb_check_resource, iores_array); + if (ret < 0) + return ret; + acpi_dev_free_resource_list(&acpi_resource_list); + if (resource_type(iores_array) != IORESOURCE_MEM) { dev_err(dev, FW_BUG "TPM2 ACPI table does not define a memory resource\n"); return -EINVAL; -- cgit v1.2.3 From 393f362389cecc2e4f2e3520a6c8ee9dbb1e3d15 Mon Sep 17 00:00:00 2001 From: Florian Bezdeka Date: Tue, 20 Jun 2023 13:11:01 +0200 Subject: tpm/tpm_tis: Disable interrupts for Lenovo L590 devices The Lenovo L590 suffers from an irq storm issue like the T490, T490s and P360 Tiny, so add an entry for it to tpm_tis_dmi_table and force polling. Cc: stable@vger.kernel.org # v6.4+ Link: https://bugzilla.redhat.com/show_bug.cgi?id=2214069#c0 Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Signed-off-by: Florian Bezdeka Reviewed-by: Jerry Snitselaar Reviewed-by: Hans de Goede Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 4e4426965cd0..cc42cf3de960 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -154,6 +154,14 @@ static const struct dmi_system_id tpm_tis_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad L490"), }, }, + { + .callback = tpm_tis_disable_irq, + .ident = "ThinkPad L590", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), + DMI_MATCH(DMI_PRODUCT_VERSION, "ThinkPad L590"), + }, + }, { .callback = tpm_tis_disable_irq, .ident = "UPX-TGL", -- cgit v1.2.3 From 481c2d14627de8ecbb54dd125466e4b4a5069b47 Mon Sep 17 00:00:00 2001 From: Lino Sanfilippo Date: Thu, 13 Jul 2023 21:01:08 +0200 Subject: tpm,tpm_tis: Disable interrupts after 1000 unhandled IRQs After activation of interrupts for TPM TIS drivers 0-day reports an interrupt storm on an Inspur NF5180M6 server. Fix this by detecting the storm and falling back to polling: Count the number of unhandled interrupts within a 10 ms time interval. In case that more than 1000 were unhandled deactivate interrupts entirely, deregister the handler and use polling instead. Also print a note to point to the tpm_tis_dmi_table. Since the interrupt deregistration function devm_free_irq() waits for all interrupt handlers to finish, only trigger a worker in the interrupt handler and do the unregistration in the worker to avoid a deadlock. Note: the storm detection logic equals the implementation in note_interrupt() which uses timestamps and counters stored in struct irq_desc. Since this structure is private to the generic interrupt core the TPM TIS core uses its own timestamps and counters. Furthermore the TPM interrupt handler always returns IRQ_HANDLED to prevent the generic interrupt core from processing the interrupt storm. Cc: stable@vger.kernel.org # v6.4+ Fixes: e644b2f498d2 ("tpm, tpm_tis: Enable interrupt test") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202305041325.ae8b0c43-yujie.liu@intel.com/ Suggested-by: Lukas Wunner Signed-off-by: Lino Sanfilippo Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen --- drivers/char/tpm/tpm_tis_core.c | 103 ++++++++++++++++++++++++++++++++++------ drivers/char/tpm/tpm_tis_core.h | 4 ++ 2 files changed, 92 insertions(+), 15 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.c b/drivers/char/tpm/tpm_tis_core.c index 558144fa707a..88a5384c09c0 100644 --- a/drivers/char/tpm/tpm_tis_core.c +++ b/drivers/char/tpm/tpm_tis_core.c @@ -24,9 +24,12 @@ #include #include #include +#include #include "tpm.h" #include "tpm_tis_core.h" +#define TPM_TIS_MAX_UNHANDLED_IRQS 1000 + static void tpm_tis_clkrun_enable(struct tpm_chip *chip, bool value); static bool wait_for_tpm_stat_cond(struct tpm_chip *chip, u8 mask, @@ -468,25 +471,29 @@ out_err: return rc; } -static void disable_interrupts(struct tpm_chip *chip) +static void __tpm_tis_disable_interrupts(struct tpm_chip *chip) +{ + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); + u32 int_mask = 0; + + tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &int_mask); + int_mask &= ~TPM_GLOBAL_INT_ENABLE; + tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), int_mask); + + chip->flags &= ~TPM_CHIP_FLAG_IRQ; +} + +static void tpm_tis_disable_interrupts(struct tpm_chip *chip) { struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); - u32 intmask; - int rc; if (priv->irq == 0) return; - rc = tpm_tis_read32(priv, TPM_INT_ENABLE(priv->locality), &intmask); - if (rc < 0) - intmask = 0; - - intmask &= ~TPM_GLOBAL_INT_ENABLE; - rc = tpm_tis_write32(priv, TPM_INT_ENABLE(priv->locality), intmask); + __tpm_tis_disable_interrupts(chip); devm_free_irq(chip->dev.parent, priv->irq, chip); priv->irq = 0; - chip->flags &= ~TPM_CHIP_FLAG_IRQ; } /* @@ -552,7 +559,7 @@ static int tpm_tis_send(struct tpm_chip *chip, u8 *buf, size_t len) if (!test_bit(TPM_TIS_IRQ_TESTED, &priv->flags)) tpm_msleep(1); if (!test_bit(TPM_TIS_IRQ_TESTED, &priv->flags)) - disable_interrupts(chip); + tpm_tis_disable_interrupts(chip); set_bit(TPM_TIS_IRQ_TESTED, &priv->flags); return rc; } @@ -752,6 +759,57 @@ static bool tpm_tis_req_canceled(struct tpm_chip *chip, u8 status) return status == TPM_STS_COMMAND_READY; } +static irqreturn_t tpm_tis_revert_interrupts(struct tpm_chip *chip) +{ + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); + const char *product; + const char *vendor; + + dev_warn(&chip->dev, FW_BUG + "TPM interrupt storm detected, polling instead\n"); + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + product = dmi_get_system_info(DMI_PRODUCT_VERSION); + + if (vendor && product) { + dev_info(&chip->dev, + "Consider adding the following entry to tpm_tis_dmi_table:\n"); + dev_info(&chip->dev, "\tDMI_SYS_VENDOR: %s\n", vendor); + dev_info(&chip->dev, "\tDMI_PRODUCT_VERSION: %s\n", product); + } + + if (tpm_tis_request_locality(chip, 0) != 0) + return IRQ_NONE; + + __tpm_tis_disable_interrupts(chip); + tpm_tis_relinquish_locality(chip, 0); + + schedule_work(&priv->free_irq_work); + + return IRQ_HANDLED; +} + +static irqreturn_t tpm_tis_update_unhandled_irqs(struct tpm_chip *chip) +{ + struct tpm_tis_data *priv = dev_get_drvdata(&chip->dev); + irqreturn_t irqret = IRQ_HANDLED; + + if (!(chip->flags & TPM_CHIP_FLAG_IRQ)) + return IRQ_HANDLED; + + if (time_after(jiffies, priv->last_unhandled_irq + HZ/10)) + priv->unhandled_irqs = 1; + else + priv->unhandled_irqs++; + + priv->last_unhandled_irq = jiffies; + + if (priv->unhandled_irqs > TPM_TIS_MAX_UNHANDLED_IRQS) + irqret = tpm_tis_revert_interrupts(chip); + + return irqret; +} + static irqreturn_t tis_int_handler(int dummy, void *dev_id) { struct tpm_chip *chip = dev_id; @@ -761,10 +819,10 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) rc = tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt); if (rc < 0) - return IRQ_NONE; + goto err; if (interrupt == 0) - return IRQ_NONE; + goto err; set_bit(TPM_TIS_IRQ_TESTED, &priv->flags); if (interrupt & TPM_INTF_DATA_AVAIL_INT) @@ -780,10 +838,13 @@ static irqreturn_t tis_int_handler(int dummy, void *dev_id) rc = tpm_tis_write32(priv, TPM_INT_STATUS(priv->locality), interrupt); tpm_tis_relinquish_locality(chip, 0); if (rc < 0) - return IRQ_NONE; + goto err; tpm_tis_read32(priv, TPM_INT_STATUS(priv->locality), &interrupt); return IRQ_HANDLED; + +err: + return tpm_tis_update_unhandled_irqs(chip); } static void tpm_tis_gen_interrupt(struct tpm_chip *chip) @@ -804,6 +865,15 @@ static void tpm_tis_gen_interrupt(struct tpm_chip *chip) chip->flags &= ~TPM_CHIP_FLAG_IRQ; } +static void tpm_tis_free_irq_func(struct work_struct *work) +{ + struct tpm_tis_data *priv = container_of(work, typeof(*priv), free_irq_work); + struct tpm_chip *chip = priv->chip; + + devm_free_irq(chip->dev.parent, priv->irq, chip); + priv->irq = 0; +} + /* Register the IRQ and issue a command that will cause an interrupt. If an * irq is seen then leave the chip setup for IRQ operation, otherwise reverse * everything and leave in polling mode. Returns 0 on success. @@ -816,6 +886,7 @@ static int tpm_tis_probe_irq_single(struct tpm_chip *chip, u32 intmask, int rc; u32 int_status; + INIT_WORK(&priv->free_irq_work, tpm_tis_free_irq_func); rc = devm_request_threaded_irq(chip->dev.parent, irq, NULL, tis_int_handler, IRQF_ONESHOT | flags, @@ -918,6 +989,7 @@ void tpm_tis_remove(struct tpm_chip *chip) interrupt = 0; tpm_tis_write32(priv, reg, ~TPM_GLOBAL_INT_ENABLE & interrupt); + flush_work(&priv->free_irq_work); tpm_tis_clkrun_enable(chip, false); @@ -1021,6 +1093,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, chip->timeout_b = msecs_to_jiffies(TIS_TIMEOUT_B_MAX); chip->timeout_c = msecs_to_jiffies(TIS_TIMEOUT_C_MAX); chip->timeout_d = msecs_to_jiffies(TIS_TIMEOUT_D_MAX); + priv->chip = chip; priv->timeout_min = TPM_TIMEOUT_USECS_MIN; priv->timeout_max = TPM_TIMEOUT_USECS_MAX; priv->phy_ops = phy_ops; @@ -1179,7 +1252,7 @@ int tpm_tis_core_init(struct device *dev, struct tpm_tis_data *priv, int irq, rc = tpm_tis_request_locality(chip, 0); if (rc < 0) goto out_err; - disable_interrupts(chip); + tpm_tis_disable_interrupts(chip); tpm_tis_relinquish_locality(chip, 0); } } diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index 610bfadb6acf..b1a169d7d1ca 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -91,11 +91,15 @@ enum tpm_tis_flags { }; struct tpm_tis_data { + struct tpm_chip *chip; u16 manufacturer_id; struct mutex locality_count_mutex; unsigned int locality_count; int locality; int irq; + struct work_struct free_irq_work; + unsigned long last_unhandled_irq; + unsigned int unhandled_irqs; unsigned int int_mask; unsigned long flags; void __iomem *ilb_base_addr; -- cgit v1.2.3 From 636e348353a7cc52609fdba5ff3270065da140d5 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 9 Jul 2023 01:33:44 +0200 Subject: prctl: move PR_GET_AUXV out of PR_MCE_KILL Somehow PR_GET_AUXV got added into PR_MCE_KILL's switch when the patch was applied [1]. Thus move it out of the switch, to the place the patch added it. In the recently released v6.4 kernel some user could, in principle, be already using this feature by mapping the right page and passing the PR_GET_AUXV constant as a pointer: prctl(PR_MCE_KILL, PR_GET_AUXV, ...) So this does change the behavior for users. We could keep the bug since the other subcases in PR_MCE_KILL (PR_MCE_KILL_CLEAR and PR_MCE_KILL_SET) do not overlap. However, v6.4 may be recent enough (2 weeks old) that moving the lines (rather than just adding a new case) does not break anybody? Moreover, the documentation in man-pages was just committed today [2]. Link: https://lkml.kernel.org/r/20230708233344.361854-1-ojeda@kernel.org Fixes: ddc65971bb67 ("prctl: add PR_GET_AUXV to copy auxv to userspace") Link: https://lore.kernel.org/all/d81864a7f7f43bca6afa2a09fc2e850e4050ab42.1680611394.git.josh@joshtriplett.org/ [1] Link: https://git.kernel.org/pub/scm/docs/man-pages/man-pages.git/commit/?id=8cf0c06bfd3c2b219b044d4151c96f0da50af9ad [2] Signed-off-by: Miguel Ojeda Cc: Josh Triplett Cc: Signed-off-by: Andrew Morton --- kernel/sys.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/sys.c b/kernel/sys.c index 05f838929e72..2410e3999ebe 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2535,11 +2535,6 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; - case PR_GET_AUXV: - if (arg4 || arg5) - return -EINVAL; - error = prctl_get_auxv((void __user *)arg2, arg3); - break; default: return -EINVAL; } @@ -2694,6 +2689,11 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SET_VMA: error = prctl_set_vma(arg2, arg3, arg4, arg5); break; + case PR_GET_AUXV: + if (arg4 || arg5) + return -EINVAL; + error = prctl_get_auxv((void __user *)arg2, arg3); + break; #ifdef CONFIG_KSM case PR_SET_MEMORY_MERGE: if (arg3 || arg4 || arg5) -- cgit v1.2.3 From 2658f94d679243209889cdfa8de3743cde1abea9 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 11 Jul 2023 13:50:20 -0400 Subject: mm/mlock: fix vma iterator conversion of apply_vma_lock_flags() apply_vma_lock_flags() calls mlock_fixup(), which could merge the VMA after where the vma iterator is located. Although this is not an issue, the next iteration of the loop will check the start of the vma to be equal to the locally saved 'tmp' variable and cause an incorrect failure scenario. Fix the error by setting tmp to the end of the vma iterator value before restarting the loop. There is also a potential of the error code being overwritten when the loop terminates early. Fix the return issue by directly returning when an error is encountered since there is nothing to undo after the loop. Link: https://lkml.kernel.org/r/20230711175020.4091336-1-Liam.Howlett@oracle.com Fixes: 37598f5a9d8b ("mlock: convert mlock to vma iterator") Signed-off-by: Liam R. Howlett Reported-by: Ryan Roberts Link: https://lore.kernel.org/linux-mm/50341ca1-d582-b33a-e3d0-acb08a65166f@arm.com/ Tested-by: Ryan Roberts Cc: Signed-off-by: Andrew Morton --- mm/mlock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index d7db94519884..0a0c996c5c21 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -477,7 +477,6 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, { unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; - int error; VMA_ITERATOR(vmi, current->mm, start); VM_BUG_ON(offset_in_page(start)); @@ -498,6 +497,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, nstart = start; tmp = vma->vm_start; for_each_vma_range(vmi, vma, end) { + int error; vm_flags_t newflags; if (vma->vm_start != tmp) @@ -511,14 +511,15 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, tmp = end; error = mlock_fixup(&vmi, vma, &prev, nstart, tmp, newflags); if (error) - break; + return error; + tmp = vma_iter_end(&vmi); nstart = tmp; } - if (vma_iter_end(&vmi) < end) + if (tmp < end) return -ENOMEM; - return error; + return 0; } /* -- cgit v1.2.3 From 3c769fd88b9742954763a968e84de09f7ad78cfe Mon Sep 17 00:00:00 2001 From: Peng Zhang Date: Tue, 11 Jul 2023 11:54:37 +0800 Subject: maple_tree: set the node limit when creating a new root node Set the node limit of the root node so that the last pivot of all nodes is the node limit (if the node is not full). This patch also fixes a bug in mas_rev_awalk(). Effectively, always setting a maximum makes mas_logical_pivot() behave as mas_safe_pivot(). Without this fix, it is possible that very small tasks would fail to find the correct gap. Although this has not been observed with real tasks, it has been reported to happen in m68k nommu running the maple tree tests. Link: https://lkml.kernel.org/r/20230711035444.526-1-zhangpeng.00@bytedance.com Link: https://lore.kernel.org/linux-mm/CAMuHMdV4T53fOw7VPoBgPR7fP6RYqf=CBhD_y_vOg53zZX_DnA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230711035444.526-2-zhangpeng.00@bytedance.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Peng Zhang Reviewed-by: Liam R. Howlett Tested-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bfffbb7cab26..4dd73cf936a6 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3692,7 +3692,8 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) mas->offset = slot; pivots[slot] = mas->last; if (mas->last != ULONG_MAX) - slot++; + pivots[++slot] = ULONG_MAX; + mas->depth = 1; mas_set_height(mas); ma_set_meta(node, maple_leaf_64, 0, slot); -- cgit v1.2.3 From 25b5949c30938c7f26dbadc948b491e0e0811c78 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Wed, 12 Jul 2023 14:46:48 +0100 Subject: selftests/mm: mkdirty: fix incorrect position of #endif The #endif is the wrong side of a } causing a build failure when __NR_userfaultfd is not defined. Fix this by moving the #end to enclose the } Link: https://lkml.kernel.org/r/20230712134648.456349-1-colin.i.king@gmail.com Fixes: 9eac40fc0cc7 ("selftests/mm: mkdirty: test behavior of (pte|pmd)_mkdirty on VMAs without write permissions") Signed-off-by: Colin Ian King Reviewed-by: David Hildenbrand Cc: Shuah Khan Cc: Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/mkdirty.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/mkdirty.c b/tools/testing/selftests/mm/mkdirty.c index 6d71d972997b..301abb99e027 100644 --- a/tools/testing/selftests/mm/mkdirty.c +++ b/tools/testing/selftests/mm/mkdirty.c @@ -321,8 +321,8 @@ close_uffd: munmap: munmap(dst, pagesize); free(src); -#endif /* __NR_userfaultfd */ } +#endif /* __NR_userfaultfd */ int main(void) { -- cgit v1.2.3 From 7a93c71a6714ca1a9c03d70432dac104b0cfb815 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 12 Jul 2023 13:39:15 -0400 Subject: maple_tree: fix 32 bit mas_next testing The test setup of mas_next is dependent on node entry size to create a 2 level tree, but the tests did not account for this in the expected value when shifting beyond the scope of the tree. Fix this by setting up the test to succeed depending on the node entries which is dependent on the 32/64 bit setup. Link: https://lkml.kernel.org/r/20230712173916.168805-1-Liam.Howlett@oracle.com Fixes: 120b116208a0 ("maple_tree: reorganize testing to restore module testing") Signed-off-by: Liam R. Howlett Reported-by: Geert Uytterhoeven Closes: https://lore.kernel.org/linux-mm/CAMuHMdV4T53fOw7VPoBgPR7fP6RYqf=CBhD_y_vOg53zZX_DnA@mail.gmail.com/ Tested-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 9939be34e516..8d4c92cbdd0c 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1898,13 +1898,16 @@ static noinline void __init next_prev_test(struct maple_tree *mt) 725}; static const unsigned long level2_32[] = { 1747, 2000, 1750, 1755, 1760, 1765}; + unsigned long last_index; if (MAPLE_32BIT) { nr_entries = 500; level2 = level2_32; + last_index = 0x138e; } else { nr_entries = 200; level2 = level2_64; + last_index = 0x7d6; } for (i = 0; i <= nr_entries; i++) @@ -2011,7 +2014,7 @@ static noinline void __init next_prev_test(struct maple_tree *mt) val = mas_next(&mas, ULONG_MAX); MT_BUG_ON(mt, val != NULL); - MT_BUG_ON(mt, mas.index != 0x7d6); + MT_BUG_ON(mt, mas.index != last_index); MT_BUG_ON(mt, mas.last != ULONG_MAX); val = mas_prev(&mas, 0); -- cgit v1.2.3 From ef5c3de5211b5a3a8102b25aa83eb4cde65ac2fd Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 12 Jul 2023 13:39:16 -0400 Subject: maple_tree: fix node allocation testing on 32 bit Internal node counting was altered and the 64 bit test was updated, however the 32bit test was missed. Restore the 32bit test to a functional state. Link: https://lore.kernel.org/linux-mm/CAMuHMdV4T53fOw7VPoBgPR7fP6RYqf=CBhD_y_vOg53zZX_DnA@mail.gmail.com/ Link: https://lkml.kernel.org/r/20230712173916.168805-2-Liam.Howlett@oracle.com Fixes: 541e06b772c1 ("maple_tree: remove GFP_ZERO from kmem_cache_alloc() and kmem_cache_alloc_bulk()") Signed-off-by: Liam R. Howlett Cc: Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 03539d86cdf0..75ea2081a317 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -206,9 +206,9 @@ static noinline void __init check_new_node(struct maple_tree *mt) e = i - 1; } else { if (i >= 4) - e = i - 4; - else if (i == 3) - e = i - 2; + e = i - 3; + else if (i >= 1) + e = i - 1; else e = 0; } -- cgit v1.2.3 From f1a07c2b4e2c473ec322b8b9ece071b8c88a3512 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Jul 2023 12:03:21 +0100 Subject: btrfs: zoned: fix memory leak after finding block group with super blocks At exclude_super_stripes(), if we happen to find a block group that has super blocks mapped to it and we are on a zoned filesystem, we error out as this is not supposed to happen, indicating either a bug or maybe some memory corruption for example. However we are exiting the function without freeing the memory allocated for the logical address of the super blocks. Fix this by freeing the logical address. Fixes: 12659251ca5d ("btrfs: implement log-structured superblock for ZONED mode") CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index f53297726238..b7da14848686 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2088,6 +2088,7 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) /* Shouldn't have super stripes in sequential zones */ if (zoned && nr) { + kfree(logical); btrfs_err(fs_info, "zoned: block group %llu must not contain super block", cache->start); -- cgit v1.2.3 From b777d279ff31979add57e8a3f810bceb7ef0cfb7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Jul 2023 18:15:30 +0100 Subject: btrfs: fix double iput() on inode after an error during orphan cleanup At btrfs_orphan_cleanup(), if we were able to find the inode, we do an iput() on the inode, then if btrfs_drop_verity_items() succeeds and then either btrfs_start_transaction() or btrfs_del_orphan_item() fail, we do another iput() in the respective error paths, resulting in an extra iput() on the inode. Fix this by setting inode to NULL after the first iput(), as iput() ignores a NULL inode pointer argument. Fixes: a13bb2c03848 ("btrfs: add missing iputs on orphan cleanup failure") CC: stable@vger.kernel.org # 6.4 Reviewed-by: Boris Burkov Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dbbb67293e34..d919318d2498 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3728,6 +3728,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (!ret) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); + inode = NULL; if (ret) goto out; } -- cgit v1.2.3 From cbaee87f2ef628c10331b69a2f3def6bc32402d7 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 3 Jul 2023 18:15:31 +0100 Subject: btrfs: fix iput() on error pointer after error during orphan cleanup At btrfs_orphan_cleanup(), if we can't find an inode (btrfs_iget() returns an -ENOENT error pointer), we proceed with 'ret' set to -ENOENT and the inode pointer set to ERR_PTR(-ENOENT). Later when we proceed to the body of the following if statement: if (ret == -ENOENT || inode->i_nlink) { (...) trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); iput(inode); goto out; } (...) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); if (ret) { iput(inode); goto out; } continue; } If we get an error from btrfs_start_transaction() or from the call to btrfs_del_orphan_item() we end calling iput() against an inode pointer that has a value of ERR_PTR(-ENOENT), resulting in a crash with the following trace: [876.667] BUG: kernel NULL pointer dereference, address: 0000000000000096 [876.667] #PF: supervisor read access in kernel mode [876.667] #PF: error_code(0x0000) - not-present page [876.667] PGD 0 P4D 0 [876.668] Oops: 0000 [#1] PREEMPT SMP PTI [876.668] CPU: 0 PID: 2356187 Comm: mount Tainted: G W 6.4.0-rc6-btrfs-next-134+ #1 [876.668] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [876.668] RIP: 0010:iput+0xa/0x20 [876.668] Code: ff ff ff 66 (...) [876.669] RSP: 0018:ffffafa9c0c9f9d0 EFLAGS: 00010282 [876.669] RAX: ffffffffffffffe4 RBX: 000000000009453b RCX: 0000000000000000 [876.669] RDX: 0000000000000001 RSI: ffffafa9c0c9f930 RDI: fffffffffffffffe [876.669] RBP: ffff95c612f3b800 R08: 0000000000000001 R09: ffffffffffffffe4 [876.670] R10: 00018f2a71010000 R11: 000000000ead96e3 R12: ffff95cb7d6909a0 [876.670] R13: fffffffffffffffe R14: ffff95c60f477000 R15: 00000000ffffffe4 [876.670] FS: 00007f5fbe30a840(0000) GS:ffff95ccdfa00000(0000) knlGS:0000000000000000 [876.670] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [876.671] CR2: 0000000000000096 CR3: 000000055e9f6004 CR4: 0000000000370ef0 [876.671] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [876.671] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [876.672] Call Trace: [876.744] [876.744] ? __die_body+0x1b/0x60 [876.744] ? page_fault_oops+0x15d/0x450 [876.745] ? __kmem_cache_alloc_node+0x47/0x410 [876.745] ? do_user_addr_fault+0x65/0x8a0 [876.745] ? exc_page_fault+0x74/0x170 [876.746] ? asm_exc_page_fault+0x22/0x30 [876.746] ? iput+0xa/0x20 [876.746] btrfs_orphan_cleanup+0x221/0x330 [btrfs] [876.746] btrfs_lookup_dentry+0x58f/0x5f0 [btrfs] [876.747] btrfs_lookup+0xe/0x30 [btrfs] [876.747] __lookup_slow+0x82/0x130 [876.785] walk_component+0xe5/0x160 [876.786] path_lookupat.isra.0+0x6e/0x150 [876.786] filename_lookup+0xcf/0x1a0 [876.786] ? mod_objcg_state+0xd2/0x360 [876.786] ? obj_cgroup_charge+0xf5/0x110 [876.787] ? should_failslab+0xa/0x20 [876.787] ? kmem_cache_alloc+0x47/0x450 [876.787] vfs_path_lookup+0x51/0x90 [876.788] mount_subtree+0x8d/0x130 [876.788] btrfs_mount+0x149/0x410 [btrfs] [876.788] ? __kmem_cache_alloc_node+0x47/0x410 [876.788] ? vfs_parse_fs_param+0xc0/0x110 [876.789] legacy_get_tree+0x24/0x50 [876.834] vfs_get_tree+0x22/0xd0 [876.852] path_mount+0x2d8/0x9c0 [876.852] do_mount+0x79/0x90 [876.852] __x64_sys_mount+0x8e/0xd0 [876.853] do_syscall_64+0x38/0x90 [876.899] entry_SYSCALL_64_after_hwframe+0x72/0xdc [876.958] RIP: 0033:0x7f5fbe50b76a [876.959] Code: 48 8b 0d a9 (...) [876.959] RSP: 002b:00007fff01925798 EFLAGS: 00000246 ORIG_RAX: 00000000000000a5 [876.959] RAX: ffffffffffffffda RBX: 00007f5fbe694264 RCX: 00007f5fbe50b76a [876.960] RDX: 0000561bde6c8720 RSI: 0000561bde6bdec0 RDI: 0000561bde6c31a0 [876.960] RBP: 0000561bde6bdc70 R08: 0000000000000000 R09: 0000000000000001 [876.960] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 [876.960] R13: 0000561bde6c31a0 R14: 0000561bde6c8720 R15: 0000561bde6bdc70 [876.960] So fix this by setting 'inode' to NULL whenever we get an error from btrfs_iget(), and to make the code simpler, stop testing for 'ret' being -ENOENT to check if we have an inode - instead test for 'inode' being NULL or not. Having a NULL 'inode' prevents any iput() call from crashing, as iput() ignores NULL inode pointers. Also, stop testing for a NULL return value from btrfs_iget() with PTR_ERR_OR_ZERO(), because btrfs_iget() never returns NULL - in case an inode is not found, it returns ERR_PTR(-ENOENT), and in case of memory allocation failure, it returns ERR_PTR(-ENOMEM). We also don't need the extra iput() calls on the error branches for the btrfs_start_transaction() and btrfs_del_orphan_item() calls, as we have already called iput() before, so remove them. Fixes: a13bb2c03848 ("btrfs: add missing iputs on orphan cleanup failure") CC: stable@vger.kernel.org # 6.4 Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d919318d2498..c8921589e2f3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3659,11 +3659,14 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) found_key.type = BTRFS_INODE_ITEM_KEY; found_key.offset = 0; inode = btrfs_iget(fs_info->sb, last_objectid, root); - ret = PTR_ERR_OR_ZERO(inode); - if (ret && ret != -ENOENT) - goto out; + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + inode = NULL; + if (ret != -ENOENT) + goto out; + } - if (ret == -ENOENT && root == fs_info->tree_root) { + if (!inode && root == fs_info->tree_root) { struct btrfs_root *dead_root; int is_dead_root = 0; @@ -3724,8 +3727,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. */ - if (ret == -ENOENT || inode->i_nlink) { - if (!ret) { + if (!inode || inode->i_nlink) { + if (inode) { ret = btrfs_drop_verity_items(BTRFS_I(inode)); iput(inode); inode = NULL; @@ -3735,7 +3738,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) trans = btrfs_start_transaction(root, 1); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - iput(inode); goto out; } btrfs_debug(fs_info, "auto deleting %Lu", @@ -3743,10 +3745,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = btrfs_del_orphan_item(trans, root, found_key.objectid); btrfs_end_transaction(trans); - if (ret) { - iput(inode); + if (ret) goto out; - } continue; } -- cgit v1.2.3 From 866e98a4d95d93de2d485f82c69ffeabd712e48b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 6 Jul 2023 00:41:16 +0100 Subject: btrfs: use irq safe locking when running and adding delayed iputs Running delayed iputs, which never happens in an irq context, needs to lock the spinlock fs_info->delayed_iput_lock. When finishing bios for data writes (irq context, bio.c) we call btrfs_put_ordered_extent() which needs to add a delayed iput and for that it needs to acquire the spinlock fs_info->delayed_iput_lock. Without disabling irqs when running delayed iputs we can therefore deadlock on that spinlock. The same deadlock can also happen when adding an inode to the delayed iputs list, since this can be done outside an irq context as well. Syzbot recently reported this, which results in the following trace: ================================ WARNING: inconsistent lock state 6.4.0-syzkaller-09904-ga507db1d8fdc #0 Not tainted -------------------------------- inconsistent {IN-SOFTIRQ-W} -> {SOFTIRQ-ON-W} usage. btrfs-cleaner/16079 [HC0[0]:SC0[0]:HE1:SE1] takes: ffff888107804d20 (&fs_info->delayed_iput_lock){+.?.}-{2:2}, at: spin_lock include/linux/spinlock.h:350 [inline] ffff888107804d20 (&fs_info->delayed_iput_lock){+.?.}-{2:2}, at: btrfs_run_delayed_iputs+0x28/0xe0 fs/btrfs/inode.c:3523 {IN-SOFTIRQ-W} state was registered at: lock_acquire kernel/locking/lockdep.c:5761 [inline] lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5726 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] btrfs_add_delayed_iput+0x128/0x390 fs/btrfs/inode.c:3490 btrfs_put_ordered_extent fs/btrfs/ordered-data.c:559 [inline] btrfs_put_ordered_extent+0x2f6/0x610 fs/btrfs/ordered-data.c:547 __btrfs_bio_end_io fs/btrfs/bio.c:118 [inline] __btrfs_bio_end_io+0x136/0x180 fs/btrfs/bio.c:112 btrfs_orig_bbio_end_io+0x86/0x2b0 fs/btrfs/bio.c:163 btrfs_simple_end_io+0x105/0x380 fs/btrfs/bio.c:378 bio_endio+0x589/0x690 block/bio.c:1617 req_bio_endio block/blk-mq.c:766 [inline] blk_update_request+0x5c5/0x1620 block/blk-mq.c:911 blk_mq_end_request+0x59/0x680 block/blk-mq.c:1032 lo_complete_rq+0x1c6/0x280 drivers/block/loop.c:370 blk_complete_reqs+0xb3/0xf0 block/blk-mq.c:1110 __do_softirq+0x1d4/0x905 kernel/softirq.c:553 run_ksoftirqd kernel/softirq.c:921 [inline] run_ksoftirqd+0x31/0x60 kernel/softirq.c:913 smpboot_thread_fn+0x659/0x9e0 kernel/smpboot.c:164 kthread+0x344/0x440 kernel/kthread.c:389 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 irq event stamp: 39 hardirqs last enabled at (39): [] __do_kmem_cache_free mm/slab.c:3558 [inline] hardirqs last enabled at (39): [] kmem_cache_free mm/slab.c:3582 [inline] hardirqs last enabled at (39): [] kmem_cache_free+0x244/0x370 mm/slab.c:3575 hardirqs last disabled at (38): [] __do_kmem_cache_free mm/slab.c:3553 [inline] hardirqs last disabled at (38): [] kmem_cache_free mm/slab.c:3582 [inline] hardirqs last disabled at (38): [] kmem_cache_free+0x1de/0x370 mm/slab.c:3575 softirqs last enabled at (0): [] copy_process+0x227f/0x75c0 kernel/fork.c:2448 softirqs last disabled at (0): [<0000000000000000>] 0x0 other info that might help us debug this: Possible unsafe locking scenario: CPU0 ---- lock(&fs_info->delayed_iput_lock); lock(&fs_info->delayed_iput_lock); *** DEADLOCK *** 1 lock held by btrfs-cleaner/16079: #0: ffff888107804860 (&fs_info->cleaner_mutex){+.+.}-{3:3}, at: cleaner_kthread+0x103/0x4b0 fs/btrfs/disk-io.c:1463 stack backtrace: CPU: 3 PID: 16079 Comm: btrfs-cleaner Not tainted 6.4.0-syzkaller-09904-ga507db1d8fdc #0 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.14.0-2 04/01/2014 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106 print_usage_bug kernel/locking/lockdep.c:3978 [inline] valid_state kernel/locking/lockdep.c:4020 [inline] mark_lock_irq kernel/locking/lockdep.c:4223 [inline] mark_lock.part.0+0x1102/0x1960 kernel/locking/lockdep.c:4685 mark_lock kernel/locking/lockdep.c:4649 [inline] mark_usage kernel/locking/lockdep.c:4598 [inline] __lock_acquire+0x8e4/0x5e20 kernel/locking/lockdep.c:5098 lock_acquire kernel/locking/lockdep.c:5761 [inline] lock_acquire+0x1b1/0x520 kernel/locking/lockdep.c:5726 __raw_spin_lock include/linux/spinlock_api_smp.h:133 [inline] _raw_spin_lock+0x2e/0x40 kernel/locking/spinlock.c:154 spin_lock include/linux/spinlock.h:350 [inline] btrfs_run_delayed_iputs+0x28/0xe0 fs/btrfs/inode.c:3523 cleaner_kthread+0x2e5/0x4b0 fs/btrfs/disk-io.c:1478 kthread+0x344/0x440 kernel/kthread.c:389 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 So fix this by using spin_lock_irq() and spin_unlock_irq() when running delayed iputs, and using spin_lock_irqsave() and spin_unlock_irqrestore() when adding a delayed iput(). Reported-by: syzbot+da501a04be5ff533b102@syzkaller.appspotmail.com Fixes: ec63b84d4611 ("btrfs: add an ordered_extent pointer to struct btrfs_bio") Link: https://lore.kernel.org/linux-btrfs/000000000000d5c89a05ffbd39dd@google.com/ Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c8921589e2f3..b12850b31cb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3482,15 +3482,21 @@ zeroit: void btrfs_add_delayed_iput(struct btrfs_inode *inode) { struct btrfs_fs_info *fs_info = inode->root->fs_info; + unsigned long flags; if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; atomic_inc(&fs_info->nr_delayed_iputs); - spin_lock(&fs_info->delayed_iput_lock); + /* + * Need to be irq safe here because we can be called from either an irq + * context (see bio.c and btrfs_put_ordered_extent()) or a non-irq + * context. + */ + spin_lock_irqsave(&fs_info->delayed_iput_lock, flags); ASSERT(list_empty(&inode->delayed_iput)); list_add_tail(&inode->delayed_iput, &fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irqrestore(&fs_info->delayed_iput_lock, flags); if (!test_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); } @@ -3499,37 +3505,46 @@ static void run_delayed_iput_locked(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { list_del_init(&inode->delayed_iput); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); iput(&inode->vfs_inode); if (atomic_dec_and_test(&fs_info->nr_delayed_iputs)) wake_up(&fs_info->delayed_iputs_wait); - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); } static void btrfs_run_delayed_iput(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) { if (!list_empty(&inode->delayed_iput)) { - spin_lock(&fs_info->delayed_iput_lock); + spin_lock_irq(&fs_info->delayed_iput_lock); if (!list_empty(&inode->delayed_iput)) run_delayed_iput_locked(fs_info, inode); - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } } void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) { - - spin_lock(&fs_info->delayed_iput_lock); + /* + * btrfs_put_ordered_extent() can run in irq context (see bio.c), which + * calls btrfs_add_delayed_iput() and that needs to lock + * fs_info->delayed_iput_lock. So we need to disable irqs here to + * prevent a deadlock. + */ + spin_lock_irq(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); - cond_resched_lock(&fs_info->delayed_iput_lock); + if (need_resched()) { + spin_unlock_irq(&fs_info->delayed_iput_lock); + cond_resched(); + spin_lock_irq(&fs_info->delayed_iput_lock); + } } - spin_unlock(&fs_info->delayed_iput_lock); + spin_unlock_irq(&fs_info->delayed_iput_lock); } /* -- cgit v1.2.3 From 486c737f7fdc0c3f6464cf27ede811daec2769a1 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 30 Jun 2023 08:56:40 +0800 Subject: btrfs: raid56: always verify the P/Q contents for scrub [REGRESSION] Commit 75b470332965 ("btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap") changed the behavior of scrub_rbio(). Initially if we have no error reading the raid bio, we will assign @need_check to true, then finish_parity_scrub() would later verify the content of P/Q stripes before writeback. But after that commit we never verify the content of P/Q stripes and just writeback them. This can lead to unrepaired P/Q stripes during scrub, or already corrupted P/Q copied to the dev-replace target. [FIX] The situation is more complex than the regression, in fact the initial behavior is not 100% correct either. If we have the following rare case, it can still lead to the same problem using the old behavior: 0 16K 32K 48K 64K Data 1: |IIIIIII| | Data 2: | | Parity: | |CCCCCCC| | Where "I" means IO error, "C" means corruption. In the above case, we're scrubbing the parity stripe, then read out all the contents of Data 1, Data 2, Parity stripes. But found IO error in Data 1, which leads to rebuild using Data 2 and Parity and got the correct data. In that case, we would not verify if the Parity is correct for range [16K, 32K). So here we have to always verify the content of Parity no matter if we did recovery or not. This patch would remove the @need_check parameter of finish_parity_scrub() completely, and would always do the P/Q verification before writeback. Fixes: 75b470332965 ("btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap") CC: stable@vger.kernel.org # 6.2+ Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/raid56.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index f37b925d587f..0249ea52bb80 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -71,7 +71,7 @@ static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio); static void scrub_rbio_work_locked(struct work_struct *work); static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) @@ -2404,7 +2404,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2445,9 +2445,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - if (!need_check) - goto writeback; - p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) return -ENOMEM; @@ -2516,7 +2513,6 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) q_sector.page = NULL; } -writeback: /* * time to start writing. Make bios for everything from the * higher layers (the bio_list in our rbio) and our p/q. Ignore @@ -2699,7 +2695,6 @@ static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio) static void scrub_rbio(struct btrfs_raid_bio *rbio) { - bool need_check = false; int sector_nr; int ret; @@ -2722,7 +2717,7 @@ static void scrub_rbio(struct btrfs_raid_bio *rbio) * We have every sector properly prepared. Can finish the scrub * and writeback the good content. */ - ret = finish_parity_scrub(rbio, need_check); + ret = finish_parity_scrub(rbio); wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int found_errors; -- cgit v1.2.3 From 17b17fcd6d446b95904a6929c40012ee7f0afc0c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 12 Jul 2023 12:44:12 -0400 Subject: btrfs: set_page_extent_mapped after read_folio in btrfs_cont_expand While trying to get the subpage blocksize tests running, I hit the following panic on generic/476 assertion failed: PagePrivate(page) && page->private, in fs/btrfs/subpage.c:229 kernel BUG at fs/btrfs/subpage.c:229! Internal error: Oops - BUG: 00000000f2000800 [#1] SMP CPU: 1 PID: 1453 Comm: fsstress Not tainted 6.4.0-rc7+ #12 Hardware name: QEMU KVM Virtual Machine, BIOS edk2-20230301gitf80f052277c8-26.fc38 03/01/2023 pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) pc : btrfs_subpage_assert+0xbc/0xf0 lr : btrfs_subpage_assert+0xbc/0xf0 Call trace: btrfs_subpage_assert+0xbc/0xf0 btrfs_subpage_clear_checked+0x38/0xc0 btrfs_page_clear_checked+0x48/0x98 btrfs_truncate_block+0x5d0/0x6a8 btrfs_cont_expand+0x5c/0x528 btrfs_write_check.isra.0+0xf8/0x150 btrfs_buffered_write+0xb4/0x760 btrfs_do_write_iter+0x2f8/0x4b0 btrfs_file_write_iter+0x1c/0x30 do_iter_readv_writev+0xc8/0x158 do_iter_write+0x9c/0x210 vfs_iter_write+0x24/0x40 iter_file_splice_write+0x224/0x390 direct_splice_actor+0x38/0x68 splice_direct_to_actor+0x12c/0x260 do_splice_direct+0x90/0xe8 generic_copy_file_range+0x50/0x90 vfs_copy_file_range+0x29c/0x470 __arm64_sys_copy_file_range+0xcc/0x498 invoke_syscall.constprop.0+0x80/0xd8 do_el0_svc+0x6c/0x168 el0_svc+0x50/0x1b0 el0t_64_sync_handler+0x114/0x120 el0t_64_sync+0x194/0x198 This happens because during btrfs_cont_expand we'll get a page, set it as mapped, and if it's not Uptodate we'll read it. However between the read and re-locking the page we could have called release_folio() on the page, but left the page in the file mapping. release_folio() can clear the page private, and thus further down we blow up when we go to modify the subpage bits. Fix this by putting the set_page_extent_mapped() after the read. This is safe because read_folio() will call set_page_extent_mapped() before it does the read, and then if we clear page private but leave it on the mapping we're completely safe re-setting set_page_extent_mapped(). With this patch I can now run generic/476 without panicing. CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Christoph Hellwig Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/inode.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b12850b31cb3..1f58debb9a04 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4863,9 +4863,6 @@ again: ret = -ENOMEM; goto out; } - ret = set_page_extent_mapped(page); - if (ret < 0) - goto out_unlock; if (!PageUptodate(page)) { ret = btrfs_read_folio(NULL, page_folio(page)); @@ -4880,6 +4877,17 @@ again: goto out_unlock; } } + + /* + * We unlock the page after the io is completed and then re-lock it + * above. release_folio() could have come in between that and cleared + * PagePrivate(), but left the page in the mapping. Set the page mapped + * here to make sure it's properly set for the subpage stuff. + */ + ret = set_page_extent_mapped(page); + if (ret < 0) + goto out_unlock; + wait_on_page_writeback(page); lock_extent(io_tree, block_start, block_end, &cached_state); -- cgit v1.2.3 From 7cad645ebf20d777b2a48750ebd80fd81593b78c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 14 Jul 2023 10:42:41 +0200 Subject: btrfs: fix ordered extent split error handling in btrfs_dio_submit_io When the call to btrfs_extract_ordered_extent in btrfs_dio_submit_io fails to allocate memory for a new ordered_extent, it calls into the btrfs_dio_end_io for error handling. btrfs_dio_end_io then assumes that bbio->ordered is set because it is supposed to be at this point, except for this error handling corner case. Try to not overload the btrfs_dio_end_io with error handling of a bio in a non-canonical state, and instead call btrfs_finish_ordered_extent and iomap_dio_bio_end_io directly for this error case. Reported-by: syzbot Fixes: b41b6f6937dc ("btrfs: use btrfs_finish_ordered_extent to complete direct writes") Reviewed-by: Josef Bacik Tested-by: syzbot Signed-off-by: Christoph Hellwig Signed-off-by: David Sterba --- fs/btrfs/inode.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1f58debb9a04..49cef61f6a39 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7873,8 +7873,11 @@ static void btrfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio, ret = btrfs_extract_ordered_extent(bbio, dio_data->ordered); if (ret) { - bbio->bio.bi_status = errno_to_blk_status(ret); - btrfs_dio_end_io(bbio); + btrfs_finish_ordered_extent(dio_data->ordered, NULL, + file_offset, dip->bytes, + !ret); + bio->bi_status = errno_to_blk_status(ret); + iomap_dio_bio_end_io(bio); return; } } -- cgit v1.2.3 From aa84ce8a78a1a5c10cdf9c7a5fb0c999fbc2c8d6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 14 Jul 2023 13:42:06 +0100 Subject: btrfs: fix warning when putting transaction with qgroups enabled after abort If we have a transaction abort with qgroups enabled we get a warning triggered when doing the final put on the transaction, like this: [552.6789] ------------[ cut here ]------------ [552.6815] WARNING: CPU: 4 PID: 81745 at fs/btrfs/transaction.c:144 btrfs_put_transaction+0x123/0x130 [btrfs] [552.6817] Modules linked in: btrfs blake2b_generic xor (...) [552.6819] CPU: 4 PID: 81745 Comm: btrfs-transacti Tainted: G W 6.4.0-rc6-btrfs-next-134+ #1 [552.6819] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [552.6819] RIP: 0010:btrfs_put_transaction+0x123/0x130 [btrfs] [552.6821] Code: bd a0 01 00 (...) [552.6821] RSP: 0018:ffffa168c0527e28 EFLAGS: 00010286 [552.6821] RAX: ffff936042caed00 RBX: ffff93604a3eb448 RCX: 0000000000000000 [552.6821] RDX: ffff93606421b028 RSI: ffffffff92ff0878 RDI: ffff93606421b010 [552.6821] RBP: ffff93606421b000 R08: 0000000000000000 R09: ffffa168c0d07c20 [552.6821] R10: 0000000000000000 R11: ffff93608dc52950 R12: ffffa168c0527e70 [552.6821] R13: ffff93606421b000 R14: ffff93604a3eb420 R15: ffff93606421b028 [552.6821] FS: 0000000000000000(0000) GS:ffff93675fb00000(0000) knlGS:0000000000000000 [552.6821] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [552.6821] CR2: 0000558ad262b000 CR3: 000000014feda005 CR4: 0000000000370ee0 [552.6822] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [552.6822] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [552.6822] Call Trace: [552.6822] [552.6822] ? __warn+0x80/0x130 [552.6822] ? btrfs_put_transaction+0x123/0x130 [btrfs] [552.6824] ? report_bug+0x1f4/0x200 [552.6824] ? handle_bug+0x42/0x70 [552.6824] ? exc_invalid_op+0x14/0x70 [552.6824] ? asm_exc_invalid_op+0x16/0x20 [552.6824] ? btrfs_put_transaction+0x123/0x130 [btrfs] [552.6826] btrfs_cleanup_transaction+0xe7/0x5e0 [btrfs] [552.6828] ? _raw_spin_unlock_irqrestore+0x23/0x40 [552.6828] ? try_to_wake_up+0x94/0x5e0 [552.6828] ? __pfx_process_timeout+0x10/0x10 [552.6828] transaction_kthread+0x103/0x1d0 [btrfs] [552.6830] ? __pfx_transaction_kthread+0x10/0x10 [btrfs] [552.6832] kthread+0xee/0x120 [552.6832] ? __pfx_kthread+0x10/0x10 [552.6832] ret_from_fork+0x29/0x50 [552.6832] [552.6832] ---[ end trace 0000000000000000 ]--- This corresponds to this line of code: void btrfs_put_transaction(struct btrfs_transaction *transaction) { (...) WARN_ON(!RB_EMPTY_ROOT( &transaction->delayed_refs.dirty_extent_root)); (...) } The warning happens because btrfs_qgroup_destroy_extent_records(), called in the transaction abort path, we free all entries from the rbtree "dirty_extent_root" with rbtree_postorder_for_each_entry_safe(), but we don't actually empty the rbtree - it's still pointing to nodes that were freed. So set the rbtree's root node to NULL to avoid this warning (assign RB_ROOT). Fixes: 81f7eb00ff5b ("btrfs: destroy qgroup extent records on transaction abort") CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Josef Bacik Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/qgroup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index da1f84a0eb29..2637d6b157ff 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -4445,4 +4445,5 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) ulist_free(entry->old_roots); kfree(entry); } + *root = RB_ROOT; } -- cgit v1.2.3 From 2033ab90380d46e0e9f0520fd6776a73d107fd95 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Sat, 15 Jul 2023 18:36:05 +0300 Subject: vrf: Fix lockdep splat in output path Cited commit converted the neighbour code to use the standard RCU variant instead of the RCU-bh variant, but the VRF code still uses rcu_read_lock_bh() / rcu_read_unlock_bh() around the neighbour lookup code in its IPv4 and IPv6 output paths, resulting in lockdep splats [1][2]. Can be reproduced using [3]. Fix by switching to rcu_read_lock() / rcu_read_unlock(). [1] ============================= WARNING: suspicious RCU usage 6.5.0-rc1-custom-g9c099e6dbf98 #403 Not tainted ----------------------------- include/net/neighbour.h:302 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by ping/183: #0: ffff888105ea1d80 (sk_lock-AF_INET){+.+.}-{0:0}, at: raw_sendmsg+0xc6c/0x33c0 #1: ffffffff85b46820 (rcu_read_lock_bh){....}-{1:2}, at: vrf_output+0x2e3/0x2030 stack backtrace: CPU: 0 PID: 183 Comm: ping Not tainted 6.5.0-rc1-custom-g9c099e6dbf98 #403 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc37 04/01/2014 Call Trace: dump_stack_lvl+0xc1/0xf0 lockdep_rcu_suspicious+0x211/0x3b0 vrf_output+0x1380/0x2030 ip_push_pending_frames+0x125/0x2a0 raw_sendmsg+0x200d/0x33c0 inet_sendmsg+0xa2/0xe0 __sys_sendto+0x2aa/0x420 __x64_sys_sendto+0xe5/0x1c0 do_syscall_64+0x38/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [2] ============================= WARNING: suspicious RCU usage 6.5.0-rc1-custom-g9c099e6dbf98 #403 Not tainted ----------------------------- include/net/neighbour.h:302 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 2 locks held by ping6/182: #0: ffff888114b63000 (sk_lock-AF_INET6){+.+.}-{0:0}, at: rawv6_sendmsg+0x1602/0x3e50 #1: ffffffff85b46820 (rcu_read_lock_bh){....}-{1:2}, at: vrf_output6+0xe9/0x1310 stack backtrace: CPU: 0 PID: 182 Comm: ping6 Not tainted 6.5.0-rc1-custom-g9c099e6dbf98 #403 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.2-1.fc37 04/01/2014 Call Trace: dump_stack_lvl+0xc1/0xf0 lockdep_rcu_suspicious+0x211/0x3b0 vrf_output6+0xd32/0x1310 ip6_local_out+0xb4/0x1a0 ip6_send_skb+0xbc/0x340 ip6_push_pending_frames+0xe5/0x110 rawv6_sendmsg+0x2e6e/0x3e50 inet_sendmsg+0xa2/0xe0 __sys_sendto+0x2aa/0x420 __x64_sys_sendto+0xe5/0x1c0 do_syscall_64+0x38/0x80 entry_SYSCALL_64_after_hwframe+0x63/0xcd [3] #!/bin/bash ip link add name vrf-red up numtxqueues 2 type vrf table 10 ip link add name swp1 up master vrf-red type dummy ip address add 192.0.2.1/24 dev swp1 ip address add 2001:db8:1::1/64 dev swp1 ip neigh add 192.0.2.2 lladdr 00:11:22:33:44:55 nud perm dev swp1 ip neigh add 2001:db8:1::2 lladdr 00:11:22:33:44:55 nud perm dev swp1 ip vrf exec vrf-red ping 192.0.2.2 -c 1 &> /dev/null ip vrf exec vrf-red ping6 2001:db8:1::2 -c 1 &> /dev/null Fixes: 09eed1192cec ("neighbour: switch to standard rcu, instead of rcu_bh") Reported-by: Naresh Kamboju Link: https://lore.kernel.org/netdev/CA+G9fYtEr-=GbcXNDYo3XOkwR+uYgehVoDjsP0pFLUpZ_AZcyg@mail.gmail.com/ Signed-off-by: Ido Schimmel Reviewed-by: David Ahern Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20230715153605.4068066-1-idosch@nvidia.com Signed-off-by: Paolo Abeni --- drivers/net/vrf.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index bdb3a76a352e..6043e63b42f9 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -664,7 +664,7 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, skb->protocol = htons(ETH_P_IPV6); skb->dev = dev; - rcu_read_lock_bh(); + rcu_read_lock(); nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr); neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop); if (unlikely(!neigh)) @@ -672,10 +672,10 @@ static int vrf_finish_output6(struct net *net, struct sock *sk, if (!IS_ERR(neigh)) { sock_confirm_neigh(skb, neigh); ret = neigh_output(neigh, skb, false); - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES); @@ -889,7 +889,7 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s } } - rcu_read_lock_bh(); + rcu_read_lock(); neigh = ip_neigh_for_gw(rt, skb, &is_v6gw); if (!IS_ERR(neigh)) { @@ -898,11 +898,11 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *s sock_confirm_neigh(skb, neigh); /* if crossing protocols, can not use the cached header */ ret = neigh_output(neigh, skb, is_v6gw); - rcu_read_unlock_bh(); + rcu_read_unlock(); return ret; } - rcu_read_unlock_bh(); + rcu_read_unlock(); vrf_tx_error(skb->dev, skb); return -EINVAL; } -- cgit v1.2.3 From 8fcd7c7b3a38ab5e452f542fda8f7940e77e479a Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Sun, 16 Jul 2023 15:07:41 +0530 Subject: octeontx2-pf: Dont allocate BPIDs for LBK interfaces Current driver enables backpressure for LBK interfaces. But these interfaces do not support this feature. Hence, this patch fixes the issue by skipping the backpressure configuration for these interfaces. Fixes: 75f36270990c ("octeontx2-pf: Support to enable/disable pause frames via ethtool"). Signed-off-by: Geetha sowjanya Signed-off-by: Sunil Goutham Link: https://lore.kernel.org/r/20230716093741.28063-1-gakula@marvell.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index fe8ea4e531b7..9551b422622a 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1454,8 +1454,9 @@ static int otx2_init_hw_resources(struct otx2_nic *pf) if (err) goto err_free_npa_lf; - /* Enable backpressure */ - otx2_nix_config_bp(pf, true); + /* Enable backpressure for CGX mapped PF/VFs */ + if (!is_otx2_lbkvf(pf->pdev)) + otx2_nix_config_bp(pf, true); /* Init Auras and pools used by NIX RQ, for free buffer ptrs */ err = otx2_rq_aura_pool_init(pf); -- cgit v1.2.3 From ba7b3e7d5f9014be65879ede8fd599cb222901c9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 17 Jul 2023 21:45:28 +0530 Subject: bpf: Fix subprog idx logic in check_max_stack_depth The assignment to idx in check_max_stack_depth happens once we see a bpf_pseudo_call or bpf_pseudo_func. This is not an issue as the rest of the code performs a few checks and then pushes the frame to the frame stack, except the case of async callbacks. If the async callback case causes the loop iteration to be skipped, the idx assignment will be incorrect on the next iteration of the loop. The value stored in the frame stack (as the subprogno of the current subprog) will be incorrect. This leads to incorrect checks and incorrect tail_call_reachable marking. Save the target subprog in a new variable and only assign to idx once we are done with the is_async_cb check which may skip pushing of frame to the frame stack and subsequent stack depth checks and tail call markings. Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230717161530.1238-2-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 930b5555cfd3..e682056dd144 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5621,7 +5621,7 @@ process_func: continue_func: subprog_end = subprog[idx + 1].start; for (; i < subprog_end; i++) { - int next_insn; + int next_insn, sidx; if (!bpf_pseudo_call(insn + i) && !bpf_pseudo_func(insn + i)) continue; @@ -5631,14 +5631,14 @@ continue_func: /* find the callee */ next_insn = i + insn[i].imm + 1; - idx = find_subprog(env, next_insn); - if (idx < 0) { + sidx = find_subprog(env, next_insn); + if (sidx < 0) { WARN_ONCE(1, "verifier bug. No program starts at insn %d\n", next_insn); return -EFAULT; } - if (subprog[idx].is_async_cb) { - if (subprog[idx].has_tail_call) { + if (subprog[sidx].is_async_cb) { + if (subprog[sidx].has_tail_call) { verbose(env, "verifier bug. subprog has tail_call and async cb\n"); return -EFAULT; } @@ -5647,6 +5647,7 @@ continue_func: continue; } i = next_insn; + idx = sidx; if (subprog[idx].has_tail_call) tail_call_reachable = true; -- cgit v1.2.3 From b5e9ad522c4ccd32d322877515cff8d47ed731b9 Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 17 Jul 2023 21:45:29 +0530 Subject: bpf: Repeat check_max_stack_depth for async callbacks While the check_max_stack_depth function explores call chains emanating from the main prog, which is typically enough to cover all possible call chains, it doesn't explore those rooted at async callbacks unless the async callback will have been directly called, since unlike non-async callbacks it skips their instruction exploration as they don't contribute to stack depth. It could be the case that the async callback leads to a callchain which exceeds the stack depth, but this is never reachable while only exploring the entry point from main subprog. Hence, repeat the check for the main subprog *and* all async callbacks marked by the symbolic execution pass of the verifier, as execution of the program may begin at any of them. Consider functions with following stack depths: main: 256 async: 256 foo: 256 main: rX = async bpf_timer_set_callback(...) async: foo() Here, async is not descended as it does not contribute to stack depth of main (since it is referenced using bpf_pseudo_func and not bpf_pseudo_call). However, when async is invoked asynchronously, it will end up breaching the MAX_BPF_STACK limit by calling foo. Hence, in addition to main, we also need to explore call chains beginning at all async callback subprogs in a program. Fixes: 7ddc80a476c2 ("bpf: Teach stack depth check about async callbacks.") Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230717161530.1238-3-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- kernel/bpf/verifier.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index e682056dd144..02a021c524ab 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -5573,16 +5573,17 @@ static int update_stack_depth(struct bpf_verifier_env *env, * Since recursion is prevented by check_cfg() this algorithm * only needs a local stack of MAX_CALL_FRAMES to remember callsites */ -static int check_max_stack_depth(struct bpf_verifier_env *env) +static int check_max_stack_depth_subprog(struct bpf_verifier_env *env, int idx) { - int depth = 0, frame = 0, idx = 0, i = 0, subprog_end; struct bpf_subprog_info *subprog = env->subprog_info; struct bpf_insn *insn = env->prog->insnsi; + int depth = 0, frame = 0, i, subprog_end; bool tail_call_reachable = false; int ret_insn[MAX_CALL_FRAMES]; int ret_prog[MAX_CALL_FRAMES]; int j; + i = subprog[idx].start; process_func: /* protect against potential stack overflow that might happen when * bpf2bpf calls get combined with tailcalls. Limit the caller's stack @@ -5683,6 +5684,22 @@ continue_func: goto continue_func; } +static int check_max_stack_depth(struct bpf_verifier_env *env) +{ + struct bpf_subprog_info *si = env->subprog_info; + int ret; + + for (int i = 0; i < env->subprog_cnt; i++) { + if (!i || si[i].is_async_cb) { + ret = check_max_stack_depth_subprog(env, i); + if (ret < 0) + return ret; + } + continue; + } + return 0; +} + #ifndef CONFIG_BPF_JIT_ALWAYS_ON static int get_callee_stack_depth(struct bpf_verifier_env *env, const struct bpf_insn *insn, int idx) -- cgit v1.2.3 From 824adae4530b4db1d06987d8dd31a0adef37044f Mon Sep 17 00:00:00 2001 From: Kumar Kartikeya Dwivedi Date: Mon, 17 Jul 2023 21:45:30 +0530 Subject: selftests/bpf: Add more tests for check_max_stack_depth bug Another test which now exercies the path of the verifier where it will explore call chains rooted at the async callback. Without the prior fixes, this program loads successfully, which is incorrect. Signed-off-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/r/20230717161530.1238-4-memxor@gmail.com Signed-off-by: Alexei Starovoitov --- .../selftests/bpf/progs/async_stack_depth.c | 25 ++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/async_stack_depth.c b/tools/testing/selftests/bpf/progs/async_stack_depth.c index 477ba950bb43..3517c0e01206 100644 --- a/tools/testing/selftests/bpf/progs/async_stack_depth.c +++ b/tools/testing/selftests/bpf/progs/async_stack_depth.c @@ -22,9 +22,16 @@ static int timer_cb(void *map, int *key, struct bpf_timer *timer) return buf[69]; } +__attribute__((noinline)) +static int bad_timer_cb(void *map, int *key, struct bpf_timer *timer) +{ + volatile char buf[300] = {}; + return buf[255] + timer_cb(NULL, NULL, NULL); +} + SEC("tc") -__failure __msg("combined stack size of 2 calls") -int prog(struct __sk_buff *ctx) +__failure __msg("combined stack size of 2 calls is 576. Too large") +int pseudo_call_check(struct __sk_buff *ctx) { struct hmap_elem *elem; volatile char buf[256] = {}; @@ -37,4 +44,18 @@ int prog(struct __sk_buff *ctx) return bpf_timer_set_callback(&elem->timer, timer_cb) + buf[0]; } +SEC("tc") +__failure __msg("combined stack size of 2 calls is 608. Too large") +int async_call_root_check(struct __sk_buff *ctx) +{ + struct hmap_elem *elem; + volatile char buf[256] = {}; + + elem = bpf_map_lookup_elem(&hmap, &(int){0}); + if (!elem) + return 0; + + return bpf_timer_set_callback(&elem->timer, bad_timer_cb) + buf[0]; +} + char _license[] SEC("license") = "GPL"; -- cgit v1.2.3 From a3f25d614bc73b45e8f02adc6769876dfd16ca84 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Thu, 13 Jul 2023 09:49:31 -0700 Subject: bpf, arm64: Fix BTI type used for freplace attached functions When running an freplace attached bpf program on an arm64 system w were seeing the following issue: Unhandled 64-bit el1h sync exception on CPU47, ESR 0x0000000036000003 -- BTI After a bit of work to track it down I determined that what appeared to be happening is that the 'bti c' at the start of the program was somehow being reached after a 'br' instruction. Further digging pointed me toward the fact that the function was attached via freplace. This in turn led me to build_plt which I believe is invoking the long jump which is triggering this error. To resolve it we can replace the 'bti c' with 'bti jc' and add a comment explaining why this has to be modified as such. Fixes: b2ad54e1533e ("bpf, arm64: Implement bpf_arch_text_poke() for arm64") Signed-off-by: Alexander Duyck Acked-by: Xu Kuohai Link: https://lore.kernel.org/r/168926677665.316237.9953845318337455525.stgit@ahduyck-xeon-server.home.arpa Signed-off-by: Alexei Starovoitov --- arch/arm64/net/bpf_jit_comp.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 145b540ec34f..ec2174838f2a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -322,7 +322,13 @@ static int build_prologue(struct jit_ctx *ctx, bool ebpf_from_cbpf) * */ - emit_bti(A64_BTI_C, ctx); + /* bpf function may be invoked by 3 instruction types: + * 1. bl, attached via freplace to bpf prog via short jump + * 2. br, attached via freplace to bpf prog via long jump + * 3. blr, working as a function pointer, used by emit_call. + * So BTI_JC should used here to support both br and blr. + */ + emit_bti(A64_BTI_JC, ctx); emit(A64_MOV(1, A64_R(9), A64_LR), ctx); emit(A64_NOP, ctx); -- cgit v1.2.3 From fda05798c22a354efde09a76bdfc276b2d591829 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 13 Jul 2023 23:16:44 +0200 Subject: selftests: tc: set timeout to 15 minutes When looking for something else in LKFT reports [1], I noticed that the TC selftest ended with a timeout error: not ok 1 selftests: tc-testing: tdc.sh # TIMEOUT 45 seconds The timeout had been introduced 3 years ago, see the Fixes commit below. This timeout is only in place when executing the selftests via the kselftests runner scripts. I guess this is not what most TC devs are using and nobody noticed the issue before. The new timeout is set to 15 minutes as suggested by Pedro [2]. It looks like it is plenty more time than what it takes in "normal" conditions. Fixes: 852c8cbf34d3 ("selftests/kselftest/runner.sh: Add 45 second timeout per test") Cc: stable@vger.kernel.org Link: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230711/testrun/18267241/suite/kselftest-tc-testing/test/tc-testing_tdc_sh/log [1] Link: https://lore.kernel.org/netdev/0e061d4a-9a23-9f58-3b35-d8919de332d7@tessares.net/T/ [2] Suggested-by: Pedro Tammela Signed-off-by: Matthieu Baerts Reviewed-by: Zhengchao Shao Link: https://lore.kernel.org/r/20230713-tc-selftests-lkft-v1-1-1eb4fd3a96e7@tessares.net Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/settings | 1 + 1 file changed, 1 insertion(+) create mode 100644 tools/testing/selftests/tc-testing/settings diff --git a/tools/testing/selftests/tc-testing/settings b/tools/testing/selftests/tc-testing/settings new file mode 100644 index 000000000000..e2206265f67c --- /dev/null +++ b/tools/testing/selftests/tc-testing/settings @@ -0,0 +1 @@ +timeout=900 -- cgit v1.2.3 From 719b4774a8cb1a501e2d22a5a4a3a0a870e427d5 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 13 Jul 2023 23:16:45 +0200 Subject: selftests: tc: add 'ct' action kconfig dep When looking for something else in LKFT reports [1], I noticed most of the tests were skipped because the "teardown stage" did not complete successfully. Pedro found out this is due to the fact CONFIG_NF_FLOW_TABLE is required but not listed in the 'config' file. Adding it to the list fixes the issues on LKFT side. CONFIG_NET_ACT_CT is now set to 'm' in the final kconfig. Fixes: c34b961a2492 ("net/sched: act_ct: Create nf flow table per zone") Cc: stable@vger.kernel.org Link: https://qa-reports.linaro.org/lkft/linux-next-master/build/next-20230711/testrun/18267241/suite/kselftest-tc-testing/test/tc-testing_tdc_sh/log [1] Link: https://lore.kernel.org/netdev/0e061d4a-9a23-9f58-3b35-d8919de332d7@tessares.net/T/ [2] Suggested-by: Pedro Tammela Signed-off-by: Matthieu Baerts Tested-by: Zhengchao Shao Link: https://lore.kernel.org/r/20230713-tc-selftests-lkft-v1-2-1eb4fd3a96e7@tessares.net Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index 6e73b09c20c8..d1ad29040c02 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -5,6 +5,7 @@ CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_FLOW_TABLE=m CONFIG_NF_NAT=m CONFIG_NETFILTER_XT_TARGET_LOG=m -- cgit v1.2.3 From 031c99e71fedcce93b6785d38b7d287bf59e3952 Mon Sep 17 00:00:00 2001 From: Matthieu Baerts Date: Thu, 13 Jul 2023 23:16:46 +0200 Subject: selftests: tc: add ConnTrack procfs kconfig When looking at the TC selftest reports, I noticed one test was failing because /proc/net/nf_conntrack was not available. not ok 373 3992 - Add ct action triggering DNAT tuple conflict Could not match regex pattern. Verify command output: cat: /proc/net/nf_conntrack: No such file or directory It is only available if NF_CONNTRACK_PROCFS kconfig is set. So the issue can be fixed simply by adding it to the list of required kconfig. Fixes: e46905641316 ("tc-testing: add test for ct DNAT tuple collision") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/0e061d4a-9a23-9f58-3b35-d8919de332d7@tessares.net/T/ [1] Signed-off-by: Matthieu Baerts Tested-by: Zhengchao Shao Link: https://lore.kernel.org/r/20230713-tc-selftests-lkft-v1-3-1eb4fd3a96e7@tessares.net Acked-by: Jamal Hadi Salim Signed-off-by: Jakub Kicinski --- tools/testing/selftests/tc-testing/config | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/tc-testing/config b/tools/testing/selftests/tc-testing/config index d1ad29040c02..71706197ba0f 100644 --- a/tools/testing/selftests/tc-testing/config +++ b/tools/testing/selftests/tc-testing/config @@ -5,6 +5,7 @@ CONFIG_NF_CONNTRACK=m CONFIG_NF_CONNTRACK_MARK=y CONFIG_NF_CONNTRACK_ZONES=y CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CONNTRACK_PROCFS=y CONFIG_NF_FLOW_TABLE=m CONFIG_NF_NAT=m CONFIG_NETFILTER_XT_TARGET_LOG=m -- cgit v1.2.3 From d1998e505a995f5305a6add46f3d806fa38dae06 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 17 Jul 2023 12:32:42 -0700 Subject: mailmap: add entries for past lives Update old emails for my current work email. Signed-off-by: Shannon Nelson Link: https://lore.kernel.org/r/20230717193242.43670-1-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- .mailmap | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.mailmap b/.mailmap index 1bce47a7f2ce..3af557de8e73 100644 --- a/.mailmap +++ b/.mailmap @@ -453,6 +453,8 @@ Sebastian Reichel Sedat Dilek Seth Forshee Shannon Nelson +Shannon Nelson +Shannon Nelson Shiraz Hashim Shuah Khan Shuah Khan -- cgit v1.2.3 From 195e903b342a73c08c3249cec55b07bf3f23200a Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Mon, 17 Jul 2023 10:33:06 -0700 Subject: mailmap: Add entry for old intel email Fix old email to avoid bouncing email from net/drivers and older netdev work. Anyways my @intel email hasn't been active for years. Signed-off-by: John Fastabend Link: https://lore.kernel.org/r/20230717173306.38407-1-john.fastabend@gmail.com Signed-off-by: Jakub Kicinski --- .mailmap | 1 + 1 file changed, 1 insertion(+) diff --git a/.mailmap b/.mailmap index 3af557de8e73..bcabc8a35122 100644 --- a/.mailmap +++ b/.mailmap @@ -241,6 +241,7 @@ Jisheng Zhang Johan Hovold Johan Hovold John Crispin +John Fastabend John Keeping John Paul Adrian Glaubitz John Stultz -- cgit v1.2.3 From 78adb4bcf99effbb960c5f9091e2e062509d1030 Mon Sep 17 00:00:00 2001 From: Florian Kauer Date: Mon, 17 Jul 2023 10:54:44 -0700 Subject: igc: Prevent garbled TX queue with XDP ZEROCOPY In normal operation, each populated queue item has next_to_watch pointing to the last TX desc of the packet, while each cleaned item has it set to 0. In particular, next_to_use that points to the next (necessarily clean) item to use has next_to_watch set to 0. When the TX queue is used both by an application using AF_XDP with ZEROCOPY as well as a second non-XDP application generating high traffic, the queue pointers can get in an invalid state where next_to_use points to an item where next_to_watch is NOT set to 0. However, the implementation assumes at several places that this is never the case, so if it does hold, bad things happen. In particular, within the loop inside of igc_clean_tx_irq(), next_to_clean can overtake next_to_use. Finally, this prevents any further transmission via this queue and it never gets unblocked or signaled. Secondly, if the queue is in this garbled state, the inner loop of igc_clean_tx_ring() will never terminate, completely hogging a CPU core. The reason is that igc_xdp_xmit_zc() reads next_to_use before acquiring the lock, and writing it back (potentially unmodified) later. If it got modified before locking, the outdated next_to_use is written pointing to an item that was already used elsewhere (and thus next_to_watch got written). Fixes: 9acf59a752d4 ("igc: Enable TX via AF_XDP zero-copy") Signed-off-by: Florian Kauer Reviewed-by: Kurt Kanzenbach Tested-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Reviewed-by: Simon Horman Tested-by: Naama Meir Signed-off-by: Tony Nguyen Link: https://lore.kernel.org/r/20230717175444.3217831-1-anthony.l.nguyen@intel.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/intel/igc/igc_main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9f93f0f4f752..f36bc2a1849a 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -2828,9 +2828,8 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring) struct netdev_queue *nq = txring_txq(ring); union igc_adv_tx_desc *tx_desc = NULL; int cpu = smp_processor_id(); - u16 ntu = ring->next_to_use; struct xdp_desc xdp_desc; - u16 budget; + u16 budget, ntu; if (!netif_carrier_ok(ring->netdev)) return; @@ -2840,6 +2839,7 @@ static void igc_xdp_xmit_zc(struct igc_ring *ring) /* Avoid transmit queue timeout since we share it with the slow path */ txq_trans_cond_update(nq); + ntu = ring->next_to_use; budget = igc_desc_unused(ring); while (xsk_tx_peek_desc(pool, &xdp_desc) && budget--) { -- cgit v1.2.3 From e7002b3b20c58bce4a88c15aca8e6cc894e3a7ed Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 17 Jul 2023 11:46:43 +0530 Subject: octeontx2-pf: mcs: Generate hash key using ecb(aes) Hardware generated encryption and ICV tags are found to be wrong when tested with IEEE MACSEC test vectors. This is because as per the HRM, the hash key (derived by AES-ECB block encryption of an all 0s block with the SAK) has to be programmed by the software in MCSX_RS_MCS_CPM_TX_SLAVE_SA_PLCY_MEM_4X register. Hence fix this by generating hash key in software and configuring in hardware. Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Subbaraya Sundeep Reviewed-by: Kalesh AP Link: https://lore.kernel.org/r/1689574603-28093-1-git-send-email-sbhatta@marvell.com Signed-off-by: Jakub Kicinski --- .../ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 137 +++++++++++++++------ 1 file changed, 100 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index 6e2fb24be8c1..59b138214af2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -4,6 +4,7 @@ * Copyright (C) 2022 Marvell. */ +#include #include #include #include "otx2_common.h" @@ -42,6 +43,56 @@ #define MCS_TCI_E 0x08 /* encryption */ #define MCS_TCI_C 0x04 /* changed text */ +#define CN10K_MAX_HASH_LEN 16 +#define CN10K_MAX_SAK_LEN 32 + +static int cn10k_ecb_aes_encrypt(struct otx2_nic *pfvf, u8 *sak, + u16 sak_len, u8 *hash) +{ + u8 data[CN10K_MAX_HASH_LEN] = { 0 }; + struct skcipher_request *req = NULL; + struct scatterlist sg_src, sg_dst; + struct crypto_skcipher *tfm; + DECLARE_CRYPTO_WAIT(wait); + int err; + + tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + if (IS_ERR(tfm)) { + dev_err(pfvf->dev, "failed to allocate transform for ecb-aes\n"); + return PTR_ERR(tfm); + } + + req = skcipher_request_alloc(tfm, GFP_KERNEL); + if (!req) { + dev_err(pfvf->dev, "failed to allocate request for skcipher\n"); + err = -ENOMEM; + goto free_tfm; + } + + err = crypto_skcipher_setkey(tfm, sak, sak_len); + if (err) { + dev_err(pfvf->dev, "failed to set key for skcipher\n"); + goto free_req; + } + + /* build sg list */ + sg_init_one(&sg_src, data, CN10K_MAX_HASH_LEN); + sg_init_one(&sg_dst, hash, CN10K_MAX_HASH_LEN); + + skcipher_request_set_callback(req, 0, crypto_req_done, &wait); + skcipher_request_set_crypt(req, &sg_src, &sg_dst, + CN10K_MAX_HASH_LEN, NULL); + + err = crypto_skcipher_encrypt(req); + err = crypto_wait_req(err, &wait); + +free_req: + skcipher_request_free(req); +free_tfm: + crypto_free_skcipher(tfm); + return err; +} + static struct cn10k_mcs_txsc *cn10k_mcs_get_txsc(struct cn10k_mcs_cfg *cfg, struct macsec_secy *secy) { @@ -330,19 +381,53 @@ fail: return ret; } +static int cn10k_mcs_write_keys(struct otx2_nic *pfvf, + struct macsec_secy *secy, + struct mcs_sa_plcy_write_req *req, + u8 *sak, u8 *salt, ssci_t ssci) +{ + u8 hash_rev[CN10K_MAX_HASH_LEN]; + u8 sak_rev[CN10K_MAX_SAK_LEN]; + u8 salt_rev[MACSEC_SALT_LEN]; + u8 hash[CN10K_MAX_HASH_LEN]; + u32 ssci_63_32; + int err, i; + + err = cn10k_ecb_aes_encrypt(pfvf, sak, secy->key_len, hash); + if (err) { + dev_err(pfvf->dev, "Generating hash using ECB(AES) failed\n"); + return err; + } + + for (i = 0; i < secy->key_len; i++) + sak_rev[i] = sak[secy->key_len - 1 - i]; + + for (i = 0; i < CN10K_MAX_HASH_LEN; i++) + hash_rev[i] = hash[CN10K_MAX_HASH_LEN - 1 - i]; + + for (i = 0; i < MACSEC_SALT_LEN; i++) + salt_rev[i] = salt[MACSEC_SALT_LEN - 1 - i]; + + ssci_63_32 = (__force u32)cpu_to_be32((__force u32)ssci); + + memcpy(&req->plcy[0][0], sak_rev, secy->key_len); + memcpy(&req->plcy[0][4], hash_rev, CN10K_MAX_HASH_LEN); + memcpy(&req->plcy[0][6], salt_rev, MACSEC_SALT_LEN); + req->plcy[0][7] |= (u64)ssci_63_32 << 32; + + return 0; +} + static int cn10k_mcs_write_rx_sa_plcy(struct otx2_nic *pfvf, struct macsec_secy *secy, struct cn10k_mcs_rxsc *rxsc, u8 assoc_num, bool sa_in_use) { - unsigned char *src = rxsc->sa_key[assoc_num]; struct mcs_sa_plcy_write_req *plcy_req; - u8 *salt_p = rxsc->salt[assoc_num]; + u8 *sak = rxsc->sa_key[assoc_num]; + u8 *salt = rxsc->salt[assoc_num]; struct mcs_rx_sc_sa_map *map_req; struct mbox *mbox = &pfvf->mbox; - u64 ssci_salt_95_64 = 0; - u8 reg, key_len; - u64 salt_63_0; int ret; mutex_lock(&mbox->lock); @@ -360,20 +445,10 @@ static int cn10k_mcs_write_rx_sa_plcy(struct otx2_nic *pfvf, goto fail; } - for (reg = 0, key_len = 0; key_len < secy->key_len; key_len += 8) { - memcpy((u8 *)&plcy_req->plcy[0][reg], - (src + reg * 8), 8); - reg++; - } - - if (secy->xpn) { - memcpy((u8 *)&salt_63_0, salt_p, 8); - memcpy((u8 *)&ssci_salt_95_64, salt_p + 8, 4); - ssci_salt_95_64 |= (__force u64)rxsc->ssci[assoc_num] << 32; - - plcy_req->plcy[0][6] = salt_63_0; - plcy_req->plcy[0][7] = ssci_salt_95_64; - } + ret = cn10k_mcs_write_keys(pfvf, secy, plcy_req, sak, + salt, rxsc->ssci[assoc_num]); + if (ret) + goto fail; plcy_req->sa_index[0] = rxsc->hw_sa_id[assoc_num]; plcy_req->sa_cnt = 1; @@ -586,13 +661,10 @@ static int cn10k_mcs_write_tx_sa_plcy(struct otx2_nic *pfvf, struct cn10k_mcs_txsc *txsc, u8 assoc_num) { - unsigned char *src = txsc->sa_key[assoc_num]; struct mcs_sa_plcy_write_req *plcy_req; - u8 *salt_p = txsc->salt[assoc_num]; + u8 *sak = txsc->sa_key[assoc_num]; + u8 *salt = txsc->salt[assoc_num]; struct mbox *mbox = &pfvf->mbox; - u64 ssci_salt_95_64 = 0; - u8 reg, key_len; - u64 salt_63_0; int ret; mutex_lock(&mbox->lock); @@ -603,19 +675,10 @@ static int cn10k_mcs_write_tx_sa_plcy(struct otx2_nic *pfvf, goto fail; } - for (reg = 0, key_len = 0; key_len < secy->key_len; key_len += 8) { - memcpy((u8 *)&plcy_req->plcy[0][reg], (src + reg * 8), 8); - reg++; - } - - if (secy->xpn) { - memcpy((u8 *)&salt_63_0, salt_p, 8); - memcpy((u8 *)&ssci_salt_95_64, salt_p + 8, 4); - ssci_salt_95_64 |= (__force u64)txsc->ssci[assoc_num] << 32; - - plcy_req->plcy[0][6] = salt_63_0; - plcy_req->plcy[0][7] = ssci_salt_95_64; - } + ret = cn10k_mcs_write_keys(pfvf, secy, plcy_req, sak, + salt, txsc->ssci[assoc_num]); + if (ret) + goto fail; plcy_req->plcy[0][8] = assoc_num; plcy_req->sa_index[0] = txsc->hw_sa_id[assoc_num]; -- cgit v1.2.3 From 5e5265522a9a7f91d1b0bd411d634bdaf16c80cd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Jul 2023 14:44:44 +0000 Subject: tcp: annotate data-races around tcp_rsk(req)->txhash TCP request sockets are lockless, some of their fields can change while being read by another cpu as syzbot noticed. This is usually harmless, but we should annotate the known races. This patch takes care of tcp_rsk(req)->txhash, a separate one is needed for tcp_rsk(req)->ts_recent. BUG: KCSAN: data-race in tcp_make_synack / tcp_rtx_synack write to 0xffff8881362304bc of 4 bytes by task 32083 on cpu 1: tcp_rtx_synack+0x9d/0x2a0 net/ipv4/tcp_output.c:4213 inet_rtx_syn_ack+0x38/0x80 net/ipv4/inet_connection_sock.c:880 tcp_check_req+0x379/0xc70 net/ipv4/tcp_minisocks.c:665 tcp_v6_rcv+0x125b/0x1b20 net/ipv6/tcp_ipv6.c:1673 ip6_protocol_deliver_rcu+0x92f/0xf30 net/ipv6/ip6_input.c:437 ip6_input_finish net/ipv6/ip6_input.c:482 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip6_input+0xbd/0x1b0 net/ipv6/ip6_input.c:491 dst_input include/net/dst.h:468 [inline] ip6_rcv_finish+0x1e2/0x2e0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:303 [inline] ipv6_rcv+0x74/0x150 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5452 [inline] __netif_receive_skb+0x90/0x1b0 net/core/dev.c:5566 netif_receive_skb_internal net/core/dev.c:5652 [inline] netif_receive_skb+0x4a/0x310 net/core/dev.c:5711 tun_rx_batched+0x3bf/0x400 tun_get_user+0x1d24/0x22b0 drivers/net/tun.c:1997 tun_chr_write_iter+0x18e/0x240 drivers/net/tun.c:2043 call_write_iter include/linux/fs.h:1871 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x4ab/0x7d0 fs/read_write.c:584 ksys_write+0xeb/0x1a0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x42/0x50 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd read to 0xffff8881362304bc of 4 bytes by task 32078 on cpu 0: tcp_make_synack+0x367/0xb40 net/ipv4/tcp_output.c:3663 tcp_v6_send_synack+0x72/0x420 net/ipv6/tcp_ipv6.c:544 tcp_conn_request+0x11a8/0x1560 net/ipv4/tcp_input.c:7059 tcp_v6_conn_request+0x13f/0x180 net/ipv6/tcp_ipv6.c:1175 tcp_rcv_state_process+0x156/0x1de0 net/ipv4/tcp_input.c:6494 tcp_v6_do_rcv+0x98a/0xb70 net/ipv6/tcp_ipv6.c:1509 tcp_v6_rcv+0x17b8/0x1b20 net/ipv6/tcp_ipv6.c:1735 ip6_protocol_deliver_rcu+0x92f/0xf30 net/ipv6/ip6_input.c:437 ip6_input_finish net/ipv6/ip6_input.c:482 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip6_input+0xbd/0x1b0 net/ipv6/ip6_input.c:491 dst_input include/net/dst.h:468 [inline] ip6_rcv_finish+0x1e2/0x2e0 net/ipv6/ip6_input.c:79 NF_HOOK include/linux/netfilter.h:303 [inline] ipv6_rcv+0x74/0x150 net/ipv6/ip6_input.c:309 __netif_receive_skb_one_core net/core/dev.c:5452 [inline] __netif_receive_skb+0x90/0x1b0 net/core/dev.c:5566 netif_receive_skb_internal net/core/dev.c:5652 [inline] netif_receive_skb+0x4a/0x310 net/core/dev.c:5711 tun_rx_batched+0x3bf/0x400 tun_get_user+0x1d24/0x22b0 drivers/net/tun.c:1997 tun_chr_write_iter+0x18e/0x240 drivers/net/tun.c:2043 call_write_iter include/linux/fs.h:1871 [inline] new_sync_write fs/read_write.c:491 [inline] vfs_write+0x4ab/0x7d0 fs/read_write.c:584 ksys_write+0xeb/0x1a0 fs/read_write.c:637 __do_sys_write fs/read_write.c:649 [inline] __se_sys_write fs/read_write.c:646 [inline] __x64_sys_write+0x42/0x50 fs/read_write.c:646 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x91d25731 -> 0xe79325cd Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 32078 Comm: syz-executor.4 Not tainted 6.5.0-rc1-syzkaller-00033-geb26cbb1a754 #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/03/2023 Fixes: 58d607d3e52f ("tcp: provide skb->hash to synack packets") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230717144445.653164-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 3 ++- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 4 ++-- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fd365de4d5ff..fa04ff49100b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -992,7 +992,8 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, - ip_hdr(skb)->tos, tcp_rsk(req)->txhash); + ip_hdr(skb)->tos, + READ_ONCE(tcp_rsk(req)->txhash)); } /* diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 04fc328727e6..ec05f277ce2e 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -528,7 +528,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.lrcvtime = tcp_jiffies32; newtp->lsndtime = tcp_jiffies32; - newsk->sk_txhash = treq->txhash; + newsk->sk_txhash = READ_ONCE(treq->txhash); newtp->total_retrans = req->num_retrans; tcp_init_xmit_timers(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 2cb39b6dad02..3b09cd13e2db 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -3660,7 +3660,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, rcu_read_lock(); md5 = tcp_rsk(req)->af_specific->req_md5_lookup(sk, req_to_sk(req)); #endif - skb_set_hash(skb, tcp_rsk(req)->txhash, PKT_HASH_TYPE_L4); + skb_set_hash(skb, READ_ONCE(tcp_rsk(req)->txhash), PKT_HASH_TYPE_L4); /* bpf program will be interested in the tcp_flags */ TCP_SKB_CB(skb)->tcp_flags = TCPHDR_SYN | TCPHDR_ACK; tcp_header_size = tcp_synack_options(sk, req, mss, skb, &opts, md5, @@ -4210,7 +4210,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) /* Paired with WRITE_ONCE() in sock_setsockopt() */ if (READ_ONCE(sk->sk_txrehash) == SOCK_TXREHASH_ENABLED) - tcp_rsk(req)->txhash = net_tx_rndhash(); + WRITE_ONCE(tcp_rsk(req)->txhash, net_tx_rndhash()); res = af_ops->send_synack(sk, NULL, &fl, req, NULL, TCP_SYNACK_NORMAL, NULL); if (!res) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 40dd92a2f480..eb96a8010414 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1129,7 +1129,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, req->ts_recent, sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority, - tcp_rsk(req)->txhash); + READ_ONCE(tcp_rsk(req)->txhash)); } -- cgit v1.2.3 From eba20811f32652bc1a52d5e7cc403859b86390d9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 17 Jul 2023 14:44:45 +0000 Subject: tcp: annotate data-races around tcp_rsk(req)->ts_recent TCP request sockets are lockless, tcp_rsk(req)->ts_recent can change while being read by another cpu as syzbot noticed. This is harmless, but we should annotate the known races. Note that tcp_check_req() changes req->ts_recent a bit early, we might change this in the future. BUG: KCSAN: data-race in tcp_check_req / tcp_check_req write to 0xffff88813c8afb84 of 4 bytes by interrupt on cpu 1: tcp_check_req+0x694/0xc70 net/ipv4/tcp_minisocks.c:762 tcp_v4_rcv+0x12db/0x1b70 net/ipv4/tcp_ipv4.c:2071 ip_protocol_deliver_rcu+0x356/0x6d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x13c/0x1a0 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:303 [inline] ip_local_deliver+0xec/0x1c0 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:468 [inline] ip_rcv_finish net/ipv4/ip_input.c:449 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip_rcv+0x197/0x270 net/ipv4/ip_input.c:569 __netif_receive_skb_one_core net/core/dev.c:5493 [inline] __netif_receive_skb+0x90/0x1b0 net/core/dev.c:5607 process_backlog+0x21f/0x380 net/core/dev.c:5935 __napi_poll+0x60/0x3b0 net/core/dev.c:6498 napi_poll net/core/dev.c:6565 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6698 __do_softirq+0xc1/0x265 kernel/softirq.c:571 do_softirq+0x7e/0xb0 kernel/softirq.c:472 __local_bh_enable_ip+0x64/0x70 kernel/softirq.c:396 local_bh_enable+0x1f/0x20 include/linux/bottom_half.h:33 rcu_read_unlock_bh include/linux/rcupdate.h:843 [inline] __dev_queue_xmit+0xabb/0x1d10 net/core/dev.c:4271 dev_queue_xmit include/linux/netdevice.h:3088 [inline] neigh_hh_output include/net/neighbour.h:528 [inline] neigh_output include/net/neighbour.h:542 [inline] ip_finish_output2+0x700/0x840 net/ipv4/ip_output.c:229 ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:317 NF_HOOK_COND include/linux/netfilter.h:292 [inline] ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:431 dst_output include/net/dst.h:458 [inline] ip_local_out net/ipv4/ip_output.c:126 [inline] __ip_queue_xmit+0xa4d/0xa70 net/ipv4/ip_output.c:533 ip_queue_xmit+0x38/0x40 net/ipv4/ip_output.c:547 __tcp_transmit_skb+0x1194/0x16e0 net/ipv4/tcp_output.c:1399 tcp_transmit_skb net/ipv4/tcp_output.c:1417 [inline] tcp_write_xmit+0x13ff/0x2fd0 net/ipv4/tcp_output.c:2693 __tcp_push_pending_frames+0x6a/0x1a0 net/ipv4/tcp_output.c:2877 tcp_push_pending_frames include/net/tcp.h:1952 [inline] __tcp_sock_set_cork net/ipv4/tcp.c:3336 [inline] tcp_sock_set_cork+0xe8/0x100 net/ipv4/tcp.c:3343 rds_tcp_xmit_path_complete+0x3b/0x40 net/rds/tcp_send.c:52 rds_send_xmit+0xf8d/0x1420 net/rds/send.c:422 rds_send_worker+0x42/0x1d0 net/rds/threads.c:200 process_one_work+0x3e6/0x750 kernel/workqueue.c:2408 worker_thread+0x5f2/0xa10 kernel/workqueue.c:2555 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 read to 0xffff88813c8afb84 of 4 bytes by interrupt on cpu 0: tcp_check_req+0x32a/0xc70 net/ipv4/tcp_minisocks.c:622 tcp_v4_rcv+0x12db/0x1b70 net/ipv4/tcp_ipv4.c:2071 ip_protocol_deliver_rcu+0x356/0x6d0 net/ipv4/ip_input.c:205 ip_local_deliver_finish+0x13c/0x1a0 net/ipv4/ip_input.c:233 NF_HOOK include/linux/netfilter.h:303 [inline] ip_local_deliver+0xec/0x1c0 net/ipv4/ip_input.c:254 dst_input include/net/dst.h:468 [inline] ip_rcv_finish net/ipv4/ip_input.c:449 [inline] NF_HOOK include/linux/netfilter.h:303 [inline] ip_rcv+0x197/0x270 net/ipv4/ip_input.c:569 __netif_receive_skb_one_core net/core/dev.c:5493 [inline] __netif_receive_skb+0x90/0x1b0 net/core/dev.c:5607 process_backlog+0x21f/0x380 net/core/dev.c:5935 __napi_poll+0x60/0x3b0 net/core/dev.c:6498 napi_poll net/core/dev.c:6565 [inline] net_rx_action+0x32b/0x750 net/core/dev.c:6698 __do_softirq+0xc1/0x265 kernel/softirq.c:571 run_ksoftirqd+0x17/0x20 kernel/softirq.c:939 smpboot_thread_fn+0x30a/0x4a0 kernel/smpboot.c:164 kthread+0x1d7/0x210 kernel/kthread.c:379 ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 value changed: 0x1cd237f1 -> 0x1cd237f2 Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table") Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230717144445.653164-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp_ipv4.c | 2 +- net/ipv4/tcp_minisocks.c | 9 ++++++--- net/ipv4/tcp_output.c | 2 +- net/ipv6/tcp_ipv6.c | 2 +- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index fa04ff49100b..b5c81cf5b86f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -988,7 +988,7 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, - req->ts_recent, + READ_ONCE(req->ts_recent), 0, tcp_md5_do_lookup(sk, l3index, addr, AF_INET), inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0, diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index ec05f277ce2e..c8f2aa003387 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -555,7 +555,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->max_window = newtp->snd_wnd; if (newtp->rx_opt.tstamp_ok) { - newtp->rx_opt.ts_recent = req->ts_recent; + newtp->rx_opt.ts_recent = READ_ONCE(req->ts_recent); newtp->rx_opt.ts_recent_stamp = ktime_get_seconds(); newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED; } else { @@ -619,7 +619,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, tcp_parse_options(sock_net(sk), skb, &tmp_opt, 0, NULL); if (tmp_opt.saw_tstamp) { - tmp_opt.ts_recent = req->ts_recent; + tmp_opt.ts_recent = READ_ONCE(req->ts_recent); if (tmp_opt.rcv_tsecr) tmp_opt.rcv_tsecr -= tcp_rsk(req)->ts_off; /* We do not store true stamp, but it is not required, @@ -758,8 +758,11 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, /* In sequence, PAWS is OK. */ + /* TODO: We probably should defer ts_recent change once + * we take ownership of @req. + */ if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_nxt)) - req->ts_recent = tmp_opt.rcv_tsval; + WRITE_ONCE(req->ts_recent, tmp_opt.rcv_tsval); if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) { /* Truncate SYN, it is out of window starting diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3b09cd13e2db..51d8638d4b4c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -878,7 +878,7 @@ static unsigned int tcp_synack_options(const struct sock *sk, if (likely(ireq->tstamp_ok)) { opts->options |= OPTION_TS; opts->tsval = tcp_skb_timestamp(skb) + tcp_rsk(req)->ts_off; - opts->tsecr = req->ts_recent; + opts->tsecr = READ_ONCE(req->ts_recent); remaining -= TCPOLEN_TSTAMP_ALIGNED; } if (likely(ireq->sack_ok)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index eb96a8010414..4714eb695913 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1126,7 +1126,7 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale, tcp_time_stamp_raw() + tcp_rsk(req)->ts_off, - req->ts_recent, sk->sk_bound_dev_if, + READ_ONCE(req->ts_recent), sk->sk_bound_dev_if, tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr, l3index), ipv6_get_dsfield(ipv6_hdr(skb)), 0, sk->sk_priority, READ_ONCE(tcp_rsk(req)->txhash)); -- cgit v1.2.3 From daa751444fd9d4184270b1479d8af49aaf1a1ee6 Mon Sep 17 00:00:00 2001 From: Wang Ming Date: Mon, 17 Jul 2023 17:59:19 +0800 Subject: net: ipv4: Use kfree_sensitive instead of kfree key might contain private part of the key, so better use kfree_sensitive to free it. Fixes: 38320c70d282 ("[IPSEC]: Use crypto_aead and authenc in ESP") Signed-off-by: Wang Ming Reviewed-by: Tariq Toukan Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/esp4.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index ba06ed42e428..2be2d4922557 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -1132,7 +1132,7 @@ static int esp_init_authenc(struct xfrm_state *x, err = crypto_aead_setkey(aead, key, keylen); free_key: - kfree(key); + kfree_sensitive(key); error: return err; -- cgit v1.2.3 From 4258faa130be4ea43e5e2d839467da421b8ff274 Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Mon, 17 Jul 2023 22:45:19 +0800 Subject: net:ipv6: check return value of pskb_trim() goto tx_err if an unexpected result is returned by pskb_tirm() in ip6erspan_tunnel_xmit(). Fixes: 5a963eb61b7c ("ip6_gre: Add ERSPAN native tunnel support") Signed-off-by: Yuanjun Gong Reviewed-by: David Ahern Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index da80974ad23a..070d87abf7c0 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -955,7 +955,8 @@ static netdev_tx_t ip6erspan_tunnel_xmit(struct sk_buff *skb, goto tx_err; if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto tx_err; truncate = true; } -- cgit v1.2.3 From 78a93c31003cc53aca5d67b1bbe2d5b9fc37cc4d Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Mon, 17 Jul 2023 22:46:21 +0800 Subject: drivers: net: fix return value check in emac_tso_csum() in emac_tso_csum(), return an error code if an unexpected value is returned by pskb_trim(). Signed-off-by: Yuanjun Gong Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/ethernet/qualcomm/emac/emac-mac.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 0d80447d4d3b..d5c688a8d7be 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -1260,8 +1260,11 @@ static int emac_tso_csum(struct emac_adapter *adpt, if (skb->protocol == htons(ETH_P_IP)) { u32 pkt_len = ((unsigned char *)ip_hdr(skb) - skb->data) + ntohs(ip_hdr(skb)->tot_len); - if (skb->len > pkt_len) - pskb_trim(skb, pkt_len); + if (skb->len > pkt_len) { + ret = pskb_trim(skb, pkt_len); + if (unlikely(ret)) + return ret; + } } hdr_len = skb_tcp_all_headers(skb); -- cgit v1.2.3 From bce5603365d8184734ba7e6b22e74bd2c90a7167 Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Mon, 17 Jul 2023 22:46:52 +0800 Subject: drivers:net: fix return value check in ocelot_fdma_receive_skb ocelot_fdma_receive_skb should return false if an unexpected value is returned by pskb_trim. Signed-off-by: Yuanjun Gong Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot_fdma.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c index 8e3894cf5f7c..83a3ce0c568e 100644 --- a/drivers/net/ethernet/mscc/ocelot_fdma.c +++ b/drivers/net/ethernet/mscc/ocelot_fdma.c @@ -368,7 +368,8 @@ static bool ocelot_fdma_receive_skb(struct ocelot *ocelot, struct sk_buff *skb) if (unlikely(!ndev)) return false; - pskb_trim(skb, skb->len - ETH_FCS_LEN); + if (pskb_trim(skb, skb->len - ETH_FCS_LEN)) + return false; skb->dev = ndev; skb->protocol = eth_type_trans(skb, skb->dev); -- cgit v1.2.3 From 02d84f3eb53a5be982b17c88410fa6c58806356b Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Mon, 17 Jul 2023 22:49:02 +0800 Subject: ipv4: ip_gre: fix return value check in erspan_fb_xmit() goto err_free_skb if an unexpected result is returned by pskb_tirm() in erspan_fb_xmit(). Signed-off-by: Yuanjun Gong Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 81a1cce1a7d1..914cc941af55 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -548,7 +548,8 @@ static void erspan_fb_xmit(struct sk_buff *skb, struct net_device *dev) goto err_free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto err_free_skb; truncate = true; } -- cgit v1.2.3 From aa7cb3789b429d4fdfbe767e0e0cf8c769299d7a Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Mon, 17 Jul 2023 22:49:18 +0800 Subject: ipv4: ip_gre: fix return value check in erspan_xmit() goto free_skb if an unexpected result is returned by pskb_tirm() in erspan_xmit(). Signed-off-by: Yuanjun Gong Reviewed-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 914cc941af55..22a26d1d29a0 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -690,7 +690,8 @@ static netdev_tx_t erspan_xmit(struct sk_buff *skb, goto free_skb; if (skb->len > dev->mtu + dev->hard_header_len) { - pskb_trim(skb, dev->mtu + dev->hard_header_len); + if (pskb_trim(skb, dev->mtu + dev->hard_header_len)) + goto free_skb; truncate = true; } -- cgit v1.2.3 From 81b3ade5d2b98ad6e0a473b0e1e420a801275592 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 17 Jul 2023 14:59:18 -0700 Subject: Revert "tcp: avoid the lookup process failing to get sk in ehash table" This reverts commit 3f4ca5fafc08881d7a57daa20449d171f2887043. Commit 3f4ca5fafc08 ("tcp: avoid the lookup process failing to get sk in ehash table") reversed the order in how a socket is inserted into ehash to fix an issue that ehash-lookup could fail when reqsk/full sk/twsk are swapped. However, it introduced another lookup failure. The full socket in ehash is allocated from a slab with SLAB_TYPESAFE_BY_RCU and does not have SOCK_RCU_FREE, so the socket could be reused even while it is being referenced on another CPU doing RCU lookup. Let's say a socket is reused and inserted into the same hash bucket during lookup. After the blamed commit, a new socket is inserted at the end of the list. If that happens, we will skip sockets placed after the previous position of the reused socket, resulting in ehash lookup failure. As described in Documentation/RCU/rculist_nulls.rst, we should insert a new socket at the head of the list to avoid such an issue. This issue, the swap-lookup-failure, and another variant reported in [0] can all be handled properly by adding a locked ehash lookup suggested by Eric Dumazet [1]. However, this issue could occur for every packet, thus more likely than the other two races, so let's revert the change for now. Link: https://lore.kernel.org/netdev/20230606064306.9192-1-duanmuquan@baidu.com/ [0] Link: https://lore.kernel.org/netdev/CANn89iK8snOz8TYOhhwfimC7ykYA78GA3Nyv8x06SZYa1nKdyA@mail.gmail.com/ [1] Fixes: 3f4ca5fafc08 ("tcp: avoid the lookup process failing to get sk in ehash table") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20230717215918.15723-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 17 ++--------------- net/ipv4/inet_timewait_sock.c | 8 ++++---- 2 files changed, 6 insertions(+), 19 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e7391bf310a7..0819d6001b9a 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -650,20 +650,8 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) spin_lock(lock); if (osk) { WARN_ON_ONCE(sk->sk_hash != osk->sk_hash); - ret = sk_hashed(osk); - if (ret) { - /* Before deleting the node, we insert a new one to make - * sure that the look-up-sk process would not miss either - * of them and that at least one node would exist in ehash - * table all the time. Otherwise there's a tiny chance - * that lookup process could find nothing in ehash table. - */ - __sk_nulls_add_node_tail_rcu(sk, list); - sk_nulls_del_node_init_rcu(osk); - } - goto unlock; - } - if (found_dup_sk) { + ret = sk_nulls_del_node_init_rcu(osk); + } else if (found_dup_sk) { *found_dup_sk = inet_ehash_lookup_by_sk(sk, list); if (*found_dup_sk) ret = false; @@ -672,7 +660,6 @@ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) if (ret) __sk_nulls_add_node_rcu(sk, list); -unlock: spin_unlock(lock); return ret; diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 40052414c7c7..2c1b245dba8e 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -88,10 +88,10 @@ void inet_twsk_put(struct inet_timewait_sock *tw) } EXPORT_SYMBOL_GPL(inet_twsk_put); -static void inet_twsk_add_node_tail_rcu(struct inet_timewait_sock *tw, - struct hlist_nulls_head *list) +static void inet_twsk_add_node_rcu(struct inet_timewait_sock *tw, + struct hlist_nulls_head *list) { - hlist_nulls_add_tail_rcu(&tw->tw_node, list); + hlist_nulls_add_head_rcu(&tw->tw_node, list); } static void inet_twsk_add_bind_node(struct inet_timewait_sock *tw, @@ -144,7 +144,7 @@ void inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, spin_lock(lock); - inet_twsk_add_node_tail_rcu(tw, &ehead->chain); + inet_twsk_add_node_rcu(tw, &ehead->chain); /* Step 3: Remove SK from hash chain */ if (__sk_nulls_del_node_init_rcu(sk)) -- cgit v1.2.3 From cf2ffdea0839398cb0551762af7f5efb0a6e0fea Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 18 Jul 2023 13:11:31 +0200 Subject: r8169: revert 2ab19de62d67 ("r8169: remove ASPM restrictions now that ASPM is disabled during NAPI poll") There have been reports that on a number of systems this change breaks network connectivity. Therefore effectively revert it. Mainly affected seem to be systems where BIOS denies ASPM access to OS. Due to later changes we can't do a direct revert. Fixes: 2ab19de62d67 ("r8169: remove ASPM restrictions now that ASPM is disabled during NAPI poll") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/netdev/e47bac0d-e802-65e1-b311-6acb26d5cf10@freenet.de/T/ Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217596 Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/57f13ec0-b216-d5d8-363d-5b05528ec5fb@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index fce4a2b908c2..8a8b7d8a5c3f 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -623,6 +623,7 @@ struct rtl8169_private { int cfg9346_usage_count; unsigned supports_gmii:1; + unsigned aspm_manageable:1; dma_addr_t counters_phys_addr; struct rtl8169_counters *counters; struct rtl8169_tc_offsets tc_offset; @@ -2746,7 +2747,8 @@ static void rtl_hw_aspm_clkreq_enable(struct rtl8169_private *tp, bool enable) if (tp->mac_version < RTL_GIGA_MAC_VER_32) return; - if (enable) { + /* Don't enable ASPM in the chip if OS can't control ASPM */ + if (enable && tp->aspm_manageable) { /* On these chip versions ASPM can even harm * bus communication of other PCI devices. */ @@ -5165,6 +5167,16 @@ done: rtl_rar_set(tp, mac_addr); } +/* register is set if system vendor successfully tested ASPM 1.2 */ +static bool rtl_aspm_is_safe(struct rtl8169_private *tp) +{ + if (tp->mac_version >= RTL_GIGA_MAC_VER_61 && + r8168_mac_ocp_read(tp, 0xc0b2) & 0xf) + return true; + + return false; +} + static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) { struct rtl8169_private *tp; @@ -5234,6 +5246,19 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) xid); tp->mac_version = chipset; + /* Disable ASPM L1 as that cause random device stop working + * problems as well as full system hangs for some PCIe devices users. + * Chips from RTL8168h partially have issues with L1.2, but seem + * to work fine with L1 and L1.1. + */ + if (rtl_aspm_is_safe(tp)) + rc = 0; + else if (tp->mac_version >= RTL_GIGA_MAC_VER_46) + rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1_2); + else + rc = pci_disable_link_state(pdev, PCIE_LINK_STATE_L1); + tp->aspm_manageable = !rc; + tp->dash_type = rtl_check_dash(tp); tp->cp_cmd = RTL_R16(tp, CPlusCmd) & CPCMD_MASK; -- cgit v1.2.3 From e31a9fedc7d8d80722b19628e66fcb5a36981780 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 18 Jul 2023 13:12:32 +0200 Subject: Revert "r8169: disable ASPM during NAPI poll" This reverts commit e1ed3e4d91112027b90c7ee61479141b3f948e6a. Turned out the change causes a performance regression. Link: https://lore.kernel.org/netdev/20230713124914.GA12924@green245/T/ Cc: stable@vger.kernel.org Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/055c6bc2-74fa-8c67-9897-3f658abb5ae7@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 8a8b7d8a5c3f..5eb50b265c0b 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4523,10 +4523,6 @@ static irqreturn_t rtl8169_interrupt(int irq, void *dev_instance) } if (napi_schedule_prep(&tp->napi)) { - rtl_unlock_config_regs(tp); - rtl_hw_aspm_clkreq_enable(tp, false); - rtl_lock_config_regs(tp); - rtl_irq_disable(tp); __napi_schedule(&tp->napi); } @@ -4586,14 +4582,9 @@ static int rtl8169_poll(struct napi_struct *napi, int budget) work_done = rtl_rx(dev, tp, budget); - if (work_done < budget && napi_complete_done(napi, work_done)) { + if (work_done < budget && napi_complete_done(napi, work_done)) rtl_irq_enable(tp); - rtl_unlock_config_regs(tp); - rtl_hw_aspm_clkreq_enable(tp, true); - rtl_lock_config_regs(tp); - } - return work_done; } -- cgit v1.2.3 From 9f9d4c1a2e82174a4e799ec405284a2b0de32b6a Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Wed, 19 Jul 2023 01:39:36 +0100 Subject: net: ethernet: mtk_eth_soc: always mtk_get_ib1_pkt_type entries and bind debugfs files would display wrong data on NETSYS_V2 and later because instead of using mtk_get_ib1_pkt_type the driver would use MTK_FOE_IB1_PACKET_TYPE which corresponds to NETSYS_V1(.x) SoCs. Use mtk_get_ib1_pkt_type so entries and bind records display correctly. Fixes: 03a3180e5c09e ("net: ethernet: mtk_eth_soc: introduce flow offloading support for mt7986") Signed-off-by: Daniel Golle Acked-by: Lorenzo Bianconi Link: https://lore.kernel.org/r/c0ae03d0182f4d27b874cbdf0059bc972c317f3c.1689727134.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c b/drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c index 316fe2e70fea..1a97feca77f2 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_debugfs.c @@ -98,7 +98,7 @@ mtk_ppe_debugfs_foe_show(struct seq_file *m, void *private, bool bind) acct = mtk_foe_entry_get_mib(ppe, i, NULL); - type = FIELD_GET(MTK_FOE_IB1_PACKET_TYPE, entry->ib1); + type = mtk_get_ib1_pkt_type(ppe->eth, entry->ib1); seq_printf(m, "%05x %s %7s", i, mtk_foe_entry_state_str(state), mtk_foe_pkt_type_str(type)); -- cgit v1.2.3 From 9b64e93e83c2145a750e780198b41d612e3dfa5d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 18 Jul 2023 10:41:49 -0700 Subject: llc: Check netns in llc_dgram_match(). We will remove this restriction in llc_rcv() soon, which means that the protocol handler must be aware of netns. if (!net_eq(dev_net(dev), &init_net)) goto drop; llc_rcv() fetches llc_type_handlers[llc_pdu_type(skb) - 1] and calls it if not NULL. If the PDU type is LLC_DEST_SAP, llc_sap_handler() is called to pass skb to corresponding sockets. Then, we must look up a proper socket in the same netns with skb->dev. If the destination is a multicast address, llc_sap_handler() calls llc_sap_mcast(). It calculates a hash based on DSAP and skb->dev->ifindex, iterates on a socket list, and calls llc_mcast_match() to check if the socket is the correct destination. Then, llc_mcast_match() checks if skb->dev matches with llc_sk(sk)->dev. So, we need not check netns here. OTOH, if the destination is a unicast address, llc_sap_handler() calls llc_lookup_dgram() to look up a socket, but it does not check the netns. Therefore, we need to add netns check in llc_lookup_dgram(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/llc/llc_sap.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/net/llc/llc_sap.c b/net/llc/llc_sap.c index 6805ce43a055..116c0e479183 100644 --- a/net/llc/llc_sap.c +++ b/net/llc/llc_sap.c @@ -294,25 +294,29 @@ static void llc_sap_rcv(struct llc_sap *sap, struct sk_buff *skb, static inline bool llc_dgram_match(const struct llc_sap *sap, const struct llc_addr *laddr, - const struct sock *sk) + const struct sock *sk, + const struct net *net) { struct llc_sock *llc = llc_sk(sk); return sk->sk_type == SOCK_DGRAM && - llc->laddr.lsap == laddr->lsap && - ether_addr_equal(llc->laddr.mac, laddr->mac); + net_eq(sock_net(sk), net) && + llc->laddr.lsap == laddr->lsap && + ether_addr_equal(llc->laddr.mac, laddr->mac); } /** * llc_lookup_dgram - Finds dgram socket for the local sap/mac * @sap: SAP * @laddr: address of local LLC (MAC + SAP) + * @net: netns to look up a socket in * * Search socket list of the SAP and finds connection using the local * mac, and local sap. Returns pointer for socket found, %NULL otherwise. */ static struct sock *llc_lookup_dgram(struct llc_sap *sap, - const struct llc_addr *laddr) + const struct llc_addr *laddr, + const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; @@ -322,12 +326,12 @@ static struct sock *llc_lookup_dgram(struct llc_sap *sap, rcu_read_lock_bh(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { - if (llc_dgram_match(sap, laddr, rc)) { + if (llc_dgram_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || - !llc_dgram_match(sap, laddr, rc))) { + !llc_dgram_match(sap, laddr, rc, net))) { sock_put(rc); continue; } @@ -429,7 +433,7 @@ void llc_sap_handler(struct llc_sap *sap, struct sk_buff *skb) llc_sap_mcast(sap, &laddr, skb); kfree_skb(skb); } else { - struct sock *sk = llc_lookup_dgram(sap, &laddr); + struct sock *sk = llc_lookup_dgram(sap, &laddr, dev_net(skb->dev)); if (sk) { llc_sap_rcv(sap, skb, sk); sock_put(sk); -- cgit v1.2.3 From 97b1d320f48c21e40cc42b4ac033f2520f9ecc5c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 18 Jul 2023 10:41:50 -0700 Subject: llc: Check netns in llc_estab_match() and llc_listener_match(). We will remove this restriction in llc_rcv() in the following patch, which means that the protocol handler must be aware of netns. if (!net_eq(dev_net(dev), &init_net)) goto drop; llc_rcv() fetches llc_type_handlers[llc_pdu_type(skb) - 1] and calls it if not NULL. If the PDU type is LLC_DEST_CONN, llc_conn_handler() is called to pass skb to corresponding sockets. Then, we must look up a proper socket in the same netns with skb->dev. llc_conn_handler() calls __llc_lookup() to look up a established or litening socket by __llc_lookup_established() and llc_lookup_listener(). Both functions iterate on a list and call llc_estab_match() or llc_listener_match() to check if the socket is the correct destination. However, these functions do not check netns. Also, bind() and connect() call llc_establish_connection(), which finally calls __llc_lookup_established(), to check if there is a conflicting socket. Let's test netns in llc_estab_match() and llc_listener_match(). Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- include/net/llc_conn.h | 2 +- net/llc/af_llc.c | 2 +- net/llc/llc_conn.c | 49 ++++++++++++++++++++++++++++++------------------- net/llc/llc_if.c | 2 +- 4 files changed, 33 insertions(+), 22 deletions(-) diff --git a/include/net/llc_conn.h b/include/net/llc_conn.h index 2c1ea3414640..374411b3066c 100644 --- a/include/net/llc_conn.h +++ b/include/net/llc_conn.h @@ -111,7 +111,7 @@ void llc_conn_resend_i_pdu_as_cmd(struct sock *sk, u8 nr, u8 first_p_bit); void llc_conn_resend_i_pdu_as_rsp(struct sock *sk, u8 nr, u8 first_f_bit); int llc_conn_remove_acked_pdus(struct sock *conn, u8 nr, u16 *how_many_unacked); struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr); + struct llc_addr *laddr, const struct net *net); void llc_sap_add_socket(struct llc_sap *sap, struct sock *sk); void llc_sap_remove_socket(struct llc_sap *sap, struct sock *sk); diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 57c35c960b2c..9b06c380866b 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -402,7 +402,7 @@ static int llc_ui_bind(struct socket *sock, struct sockaddr *uaddr, int addrlen) memcpy(laddr.mac, addr->sllc_mac, IFHWADDRLEN); laddr.lsap = addr->sllc_sap; rc = -EADDRINUSE; /* mac + sap clash. */ - ask = llc_lookup_established(sap, &daddr, &laddr); + ask = llc_lookup_established(sap, &daddr, &laddr, &init_net); if (ask) { sock_put(ask); goto out_put; diff --git a/net/llc/llc_conn.c b/net/llc/llc_conn.c index 912aa9bd5e29..d037009ee10f 100644 --- a/net/llc/llc_conn.c +++ b/net/llc/llc_conn.c @@ -453,11 +453,13 @@ static int llc_exec_conn_trans_actions(struct sock *sk, static inline bool llc_estab_match(const struct llc_sap *sap, const struct llc_addr *daddr, const struct llc_addr *laddr, - const struct sock *sk) + const struct sock *sk, + const struct net *net) { struct llc_sock *llc = llc_sk(sk); - return llc->laddr.lsap == laddr->lsap && + return net_eq(sock_net(sk), net) && + llc->laddr.lsap == laddr->lsap && llc->daddr.lsap == daddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac) && ether_addr_equal(llc->daddr.mac, daddr->mac); @@ -468,6 +470,7 @@ static inline bool llc_estab_match(const struct llc_sap *sap, * @sap: SAP * @daddr: address of remote LLC (MAC + SAP) * @laddr: address of local LLC (MAC + SAP) + * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection using the remote * mac, remote sap, local mac, and local sap. Returns pointer for @@ -476,7 +479,8 @@ static inline bool llc_estab_match(const struct llc_sap *sap, */ static struct sock *__llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; @@ -486,12 +490,12 @@ static struct sock *__llc_lookup_established(struct llc_sap *sap, rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { - if (llc_estab_match(sap, daddr, laddr, rc)) { + if (llc_estab_match(sap, daddr, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || - !llc_estab_match(sap, daddr, laddr, rc))) { + !llc_estab_match(sap, daddr, laddr, rc, net))) { sock_put(rc); continue; } @@ -513,29 +517,33 @@ found: struct sock *llc_lookup_established(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *sk; local_bh_disable(); - sk = __llc_lookup_established(sap, daddr, laddr); + sk = __llc_lookup_established(sap, daddr, laddr, net); local_bh_enable(); return sk; } static inline bool llc_listener_match(const struct llc_sap *sap, const struct llc_addr *laddr, - const struct sock *sk) + const struct sock *sk, + const struct net *net) { struct llc_sock *llc = llc_sk(sk); - return sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && + return net_eq(sock_net(sk), net) && + sk->sk_type == SOCK_STREAM && sk->sk_state == TCP_LISTEN && llc->laddr.lsap == laddr->lsap && ether_addr_equal(llc->laddr.mac, laddr->mac); } static struct sock *__llc_lookup_listener(struct llc_sap *sap, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { struct sock *rc; struct hlist_nulls_node *node; @@ -545,12 +553,12 @@ static struct sock *__llc_lookup_listener(struct llc_sap *sap, rcu_read_lock(); again: sk_nulls_for_each_rcu(rc, node, laddr_hb) { - if (llc_listener_match(sap, laddr, rc)) { + if (llc_listener_match(sap, laddr, rc, net)) { /* Extra checks required by SLAB_TYPESAFE_BY_RCU */ if (unlikely(!refcount_inc_not_zero(&rc->sk_refcnt))) goto again; if (unlikely(llc_sk(rc)->sap != sap || - !llc_listener_match(sap, laddr, rc))) { + !llc_listener_match(sap, laddr, rc, net))) { sock_put(rc); continue; } @@ -574,6 +582,7 @@ found: * llc_lookup_listener - Finds listener for local MAC + SAP * @sap: SAP * @laddr: address of local LLC (MAC + SAP) + * @net: netns to look up a socket in * * Search connection list of the SAP and finds connection listening on * local mac, and local sap. Returns pointer for parent socket found, @@ -581,24 +590,26 @@ found: * Caller has to make sure local_bh is disabled. */ static struct sock *llc_lookup_listener(struct llc_sap *sap, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { + struct sock *rc = __llc_lookup_listener(sap, laddr, net); static struct llc_addr null_addr; - struct sock *rc = __llc_lookup_listener(sap, laddr); if (!rc) - rc = __llc_lookup_listener(sap, &null_addr); + rc = __llc_lookup_listener(sap, &null_addr, net); return rc; } static struct sock *__llc_lookup(struct llc_sap *sap, struct llc_addr *daddr, - struct llc_addr *laddr) + struct llc_addr *laddr, + const struct net *net) { - struct sock *sk = __llc_lookup_established(sap, daddr, laddr); + struct sock *sk = __llc_lookup_established(sap, daddr, laddr, net); - return sk ? : llc_lookup_listener(sap, laddr); + return sk ? : llc_lookup_listener(sap, laddr, net); } /** @@ -776,7 +787,7 @@ void llc_conn_handler(struct llc_sap *sap, struct sk_buff *skb) llc_pdu_decode_da(skb, daddr.mac); llc_pdu_decode_dsap(skb, &daddr.lsap); - sk = __llc_lookup(sap, &saddr, &daddr); + sk = __llc_lookup(sap, &saddr, &daddr, dev_net(skb->dev)); if (!sk) goto drop; diff --git a/net/llc/llc_if.c b/net/llc/llc_if.c index dde9bf08a593..58a5f419adc6 100644 --- a/net/llc/llc_if.c +++ b/net/llc/llc_if.c @@ -92,7 +92,7 @@ int llc_establish_connection(struct sock *sk, const u8 *lmac, u8 *dmac, u8 dsap) daddr.lsap = dsap; memcpy(daddr.mac, dmac, sizeof(daddr.mac)); memcpy(laddr.mac, lmac, sizeof(laddr.mac)); - existing = llc_lookup_established(llc->sap, &daddr, &laddr); + existing = llc_lookup_established(llc->sap, &daddr, &laddr, sock_net(sk)); if (existing) { if (existing->sk_state == TCP_ESTABLISHED) { sk = existing; -- cgit v1.2.3 From 6631463b6e6673916d2481f692938f393148aa82 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 18 Jul 2023 10:41:51 -0700 Subject: llc: Don't drop packet from non-root netns. Now these upper layer protocol handlers can be called from llc_rcv() as sap->rcv_func(), which is registered by llc_sap_open(). * function which is passed to register_8022_client() -> no in-kernel user calls register_8022_client(). * snap_rcv() `- proto->rcvfunc() : registered by register_snap_client() -> aarp_rcv() and atalk_rcv() drop packets from non-root netns * stp_pdu_rcv() `- garp_protos[]->rcv() : registered by stp_proto_register() -> garp_pdu_rcv() and br_stp_rcv() are netns-aware So, we can safely remove the netns restriction in llc_rcv(). Fixes: e730c15519d0 ("[NET]: Make packet reception network namespace safe") Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/llc/llc_input.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c index c309b72a5877..7cac441862e2 100644 --- a/net/llc/llc_input.c +++ b/net/llc/llc_input.c @@ -163,9 +163,6 @@ int llc_rcv(struct sk_buff *skb, struct net_device *dev, void (*sta_handler)(struct sk_buff *skb); void (*sap_handler)(struct llc_sap *sap, struct sk_buff *skb); - if (!net_eq(dev_net(dev), &init_net)) - goto drop; - /* * When the interface is in promisc. mode, drop all the crap that it * receives, do not try to analyse it. -- cgit v1.2.3 From 7ebd00a5a20c48e6020d49a3b2afb3cdfd2da8b7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 18 Jul 2023 10:41:52 -0700 Subject: Revert "bridge: Add extack warning when enabling STP in netns." This reverts commit 56a16035bb6effb37177867cea94c13a8382f745. Since the previous commit, STP works on bridge in netns. # unshare -n # ip link add br0 type bridge # ip link add veth0 type veth peer name veth1 # ip link set veth0 master br0 up [ 50.558135] br0: port 1(veth0) entered blocking state [ 50.558366] br0: port 1(veth0) entered disabled state [ 50.558798] veth0: entered allmulticast mode [ 50.564401] veth0: entered promiscuous mode # ip link set veth1 master br0 up [ 54.215487] br0: port 2(veth1) entered blocking state [ 54.215657] br0: port 2(veth1) entered disabled state [ 54.215848] veth1: entered allmulticast mode [ 54.219577] veth1: entered promiscuous mode # ip link set br0 type bridge stp_state 1 # ip link set br0 up [ 61.960726] br0: port 2(veth1) entered blocking state [ 61.961097] br0: port 2(veth1) entered listening state [ 61.961495] br0: port 1(veth0) entered blocking state [ 61.961653] br0: port 1(veth0) entered listening state [ 63.998835] br0: port 2(veth1) entered blocking state [ 77.437113] br0: port 1(veth0) entered learning state [ 86.653501] br0: received packet on veth0 with own address as source address (addr:6e:0f:e7:6f:5f:5f, vlan:0) [ 92.797095] br0: port 1(veth0) entered forwarding state [ 92.797398] br0: topology change detected, propagating Let's remove the warning. Signed-off-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni --- net/bridge/br_stp_if.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index b65962682771..75204d36d7f9 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -201,9 +201,6 @@ int br_stp_set_enabled(struct net_bridge *br, unsigned long val, { ASSERT_RTNL(); - if (!net_eq(dev_net(br->dev), &init_net)) - NL_SET_ERR_MSG_MOD(extack, "STP does not work in non-root netns"); - if (br_mrp_enabled(br)) { NL_SET_ERR_MSG_MOD(extack, "STP can't be enabled if MRP is already enabled"); -- cgit v1.2.3 From ddbd8be68941985f166f5107109a90ce13147c44 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Jul 2023 00:29:58 +0200 Subject: netfilter: nf_tables: fix spurious set element insertion failure On some platforms there is a padding hole in the nft_verdict structure, between the verdict code and the chain pointer. On element insertion, if the new element clashes with an existing one and NLM_F_EXCL flag isn't set, we want to ignore the -EEXIST error as long as the data associated with duplicated element is the same as the existing one. The data equality check uses memcmp. For normal data (NFT_DATA_VALUE) this works fine, but for NFT_DATA_VERDICT padding area leads to spurious failure even if the verdict data is the same. This then makes the insertion fail with 'already exists' error, even though the new "key : data" matches an existing entry and userspace told the kernel that it doesn't want to receive an error indication. Fixes: c016c7e45ddf ("netfilter: nf_tables: honor NLM_F_EXCL flag in set element insertion") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 237f739da3ca..79c7eee33dcd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10517,6 +10517,9 @@ static int nft_verdict_init(const struct nft_ctx *ctx, struct nft_data *data, if (!tb[NFTA_VERDICT_CODE]) return -EINVAL; + + /* zero padding hole for memcmp */ + memset(data, 0, sizeof(*data)); data->verdict.code = ntohl(nla_get_be32(tb[NFTA_VERDICT_CODE])); switch (data->verdict.code) { -- cgit v1.2.3 From 314c82841602a111c04a7210c21dc77e0d560242 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 18 Jul 2023 01:30:33 +0200 Subject: netfilter: nf_tables: can't schedule in nft_chain_validate Can be called via nft set element list iteration, which may acquire rcu and/or bh read lock (depends on set type). BUG: sleeping function called from invalid context at net/netfilter/nf_tables_api.c:3353 in_atomic(): 0, irqs_disabled(): 0, non_block: 0, pid: 1232, name: nft preempt_count: 0, expected: 0 RCU nest depth: 1, expected: 0 2 locks held by nft/1232: #0: ffff8881180e3ea8 (&nft_net->commit_mutex){+.+.}-{3:3}, at: nf_tables_valid_genid #1: ffffffff83f5f540 (rcu_read_lock){....}-{1:2}, at: rcu_lock_acquire Call Trace: nft_chain_validate nft_lookup_validate_setelem nft_pipapo_walk nft_lookup_validate nft_chain_validate nft_immediate_validate nft_chain_validate nf_tables_validate nf_tables_abort No choice but to move it to nf_tables_validate(). Fixes: 81ea01066741 ("netfilter: nf_tables: add rescheduling points during loop detection walks") Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 79c7eee33dcd..41e7d21d4429 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3685,8 +3685,6 @@ int nft_chain_validate(const struct nft_ctx *ctx, const struct nft_chain *chain) if (err < 0) return err; } - - cond_resched(); } return 0; @@ -3710,6 +3708,8 @@ static int nft_table_validate(struct net *net, const struct nft_table *table) err = nft_chain_validate(&ctx, chain); if (err < 0) return err; + + cond_resched(); } return 0; -- cgit v1.2.3 From 87b5a5c209405cb6b57424cdfa226a6dbd349232 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Wed, 19 Jul 2023 21:08:21 +0200 Subject: netfilter: nft_set_pipapo: fix improper element removal end key should be equal to start unless NFT_SET_EXT_KEY_END is present. Its possible to add elements that only have a start key ("{ 1.0.0.0 . 2.0.0.0 }") without an internval end. Insertion treats this via: if (nft_set_ext_exists(ext, NFT_SET_EXT_KEY_END)) end = (const u8 *)nft_set_ext_key_end(ext)->data; else end = start; but removal side always uses nft_set_ext_key_end(). This is wrong and leads to garbage remaining in the set after removal next lookup/insert attempt will give: BUG: KASAN: slab-use-after-free in pipapo_get+0x8eb/0xb90 Read of size 1 at addr ffff888100d50586 by task nft-pipapo_uaf_/1399 Call Trace: kasan_report+0x105/0x140 pipapo_get+0x8eb/0xb90 nft_pipapo_insert+0x1dc/0x1710 nf_tables_newsetelem+0x31f5/0x4e00 .. Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") Reported-by: lonial con Reviewed-by: Stefano Brivio Signed-off-by: Florian Westphal --- net/netfilter/nft_set_pipapo.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c index db526cb7a485..49915a2a58eb 100644 --- a/net/netfilter/nft_set_pipapo.c +++ b/net/netfilter/nft_set_pipapo.c @@ -1929,7 +1929,11 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, int i, start, rules_fx; match_start = data; - match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + + if (nft_set_ext_exists(&e->ext, NFT_SET_EXT_KEY_END)) + match_end = (const u8 *)nft_set_ext_key_end(&e->ext)->data; + else + match_end = data; start = first_rule; rules_fx = rules_f0; -- cgit v1.2.3 From 751d460ccff3137212f47d876221534bf0490996 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 19 Jul 2023 20:19:43 +0200 Subject: netfilter: nf_tables: skip bound chain in netns release path Skip bound chain from netns release path, the rule that owns this chain releases these objects. Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 41e7d21d4429..40bff6e2ea5d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10802,6 +10802,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) ctx.family = table->family; ctx.table = table; list_for_each_entry(chain, &table->chains, list) { + if (nft_chain_is_bound(chain)) + continue; + ctx.chain = chain; list_for_each_entry_safe(rule, nr, &chain->rules, list) { list_del(&rule->list); -- cgit v1.2.3 From 6eaf41e87a223ae6f8e7a28d6e78384ad7e407f8 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 20 Jul 2023 09:17:21 +0200 Subject: netfilter: nf_tables: skip bound chain on rule flush Skip bound chain when flushing table rules, the rule that owns this chain releases these objects. Otherwise, the following warning is triggered: WARNING: CPU: 2 PID: 1217 at net/netfilter/nf_tables_api.c:2013 nf_tables_chain_destroy+0x1f7/0x210 [nf_tables] CPU: 2 PID: 1217 Comm: chain-flush Not tainted 6.1.39 #1 RIP: 0010:nf_tables_chain_destroy+0x1f7/0x210 [nf_tables] Fixes: d0e2c7de92c7 ("netfilter: nf_tables: add NFT_CHAIN_BINDING") Reported-by: Kevin Rich Signed-off-by: Pablo Neira Ayuso Signed-off-by: Florian Westphal --- net/netfilter/nf_tables_api.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 40bff6e2ea5d..b9a4d3fd1d34 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -4087,6 +4087,8 @@ static int nf_tables_delrule(struct sk_buff *skb, const struct nfnl_info *info, list_for_each_entry(chain, &table->chains, list) { if (!nft_is_active_next(net, chain)) continue; + if (nft_chain_is_bound(chain)) + continue; ctx.chain = chain; err = nft_delrule_by_chain(&ctx); -- cgit v1.2.3 From 195ef75e19287b4bc413da3e3e3722b030ac881e Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 19 Jun 2023 01:04:31 +0300 Subject: Bluetooth: use RCU for hci_conn_params and iterate safely in hci_sync hci_update_accept_list_sync iterates over hdev->pend_le_conns and hdev->pend_le_reports, and waits for controller events in the loop body, without holding hdev lock. Meanwhile, these lists and the items may be modified e.g. by le_scan_cleanup. This can invalidate the list cursor or any other item in the list, resulting to invalid behavior (eg use-after-free). Use RCU for the hci_conn_params action lists. Since the loop bodies in hci_sync block and we cannot use RCU or hdev->lock for the whole loop, copy list items first and then iterate on the copy. Only the flags field is written from elsewhere, so READ_ONCE/WRITE_ONCE should guarantee we read valid values. Free params everywhere with hci_conn_params_free so the cleanup is guaranteed to be done properly. This fixes the following, which can be triggered e.g. by BlueZ new mgmt-tester case "Add + Remove Device Nowait - Success", or by changing hci_le_set_cig_params to always return false, and running iso-tester: ================================================================== BUG: KASAN: slab-use-after-free in hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) Read of size 8 at addr ffff888001265018 by task kworker/u3:0/32 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Workqueue: hci0 hci_cmd_sync_work Call Trace: dump_stack_lvl (./arch/x86/include/asm/irqflags.h:134 lib/dump_stack.c:107) print_report (mm/kasan/report.c:320 mm/kasan/report.c:430) ? __virt_addr_valid (./include/linux/mmzone.h:1915 ./include/linux/mmzone.h:2011 arch/x86/mm/physaddr.c:65) ? hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) kasan_report (mm/kasan/report.c:538) ? hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2536 net/bluetooth/hci_sync.c:2723 net/bluetooth/hci_sync.c:2841) ? __pfx_hci_update_passive_scan_sync (net/bluetooth/hci_sync.c:2780) ? mutex_lock (kernel/locking/mutex.c:282) ? __pfx_mutex_lock (kernel/locking/mutex.c:282) ? __pfx_mutex_unlock (kernel/locking/mutex.c:538) ? __pfx_update_passive_scan_sync (net/bluetooth/hci_sync.c:2861) hci_cmd_sync_work (net/bluetooth/hci_sync.c:306) process_one_work (./arch/x86/include/asm/preempt.h:27 kernel/workqueue.c:2399) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2538) ? __pfx_worker_thread (kernel/workqueue.c:2480) kthread (kernel/kthread.c:376) ? __pfx_kthread (kernel/kthread.c:331) ret_from_fork (arch/x86/entry/entry_64.S:314) Allocated by task 31: kasan_save_stack (mm/kasan/common.c:46) kasan_set_track (mm/kasan/common.c:52) __kasan_kmalloc (mm/kasan/common.c:374 mm/kasan/common.c:383) hci_conn_params_add (./include/linux/slab.h:580 ./include/linux/slab.h:720 net/bluetooth/hci_core.c:2277) hci_connect_le_scan (net/bluetooth/hci_conn.c:1419 net/bluetooth/hci_conn.c:1589) hci_connect_cis (net/bluetooth/hci_conn.c:2266) iso_connect_cis (net/bluetooth/iso.c:390) iso_sock_connect (net/bluetooth/iso.c:899) __sys_connect (net/socket.c:2003 net/socket.c:2020) __x64_sys_connect (net/socket.c:2027) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) Freed by task 15: kasan_save_stack (mm/kasan/common.c:46) kasan_set_track (mm/kasan/common.c:52) kasan_save_free_info (mm/kasan/generic.c:523) __kasan_slab_free (mm/kasan/common.c:238 mm/kasan/common.c:200 mm/kasan/common.c:244) __kmem_cache_free (mm/slub.c:1807 mm/slub.c:3787 mm/slub.c:3800) hci_conn_params_del (net/bluetooth/hci_core.c:2323) le_scan_cleanup (net/bluetooth/hci_conn.c:202) process_one_work (./arch/x86/include/asm/preempt.h:27 kernel/workqueue.c:2399) worker_thread (./include/linux/list.h:292 kernel/workqueue.c:2538) kthread (kernel/kthread.c:376) ret_from_fork (arch/x86/entry/entry_64.S:314) ================================================================== Fixes: e8907f76544f ("Bluetooth: hci_sync: Make use of hci_cmd_sync_queue set 3") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 5 ++ net/bluetooth/hci_conn.c | 10 ++-- net/bluetooth/hci_core.c | 38 ++++++++++--- net/bluetooth/hci_event.c | 12 ++-- net/bluetooth/hci_sync.c | 117 +++++++++++++++++++++++++++++++++++---- net/bluetooth/mgmt.c | 26 ++++----- 6 files changed, 164 insertions(+), 44 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 9654567cfae3..870b6d3c5146 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -822,6 +822,7 @@ struct hci_conn_params { struct hci_conn *conn; bool explicit_connect; + /* Accessed without hdev->lock: */ hci_conn_flags_t flags; u8 privacy_mode; }; @@ -1573,7 +1574,11 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_del(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type); void hci_conn_params_clear_disabled(struct hci_dev *hdev); +void hci_conn_params_free(struct hci_conn_params *param); +void hci_pend_le_list_del_init(struct hci_conn_params *param); +void hci_pend_le_list_add(struct hci_conn_params *param, + struct list_head *list); struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 056f9516e46d..db598942cab4 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -118,7 +118,7 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) */ params->explicit_connect = false; - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); switch (params->auto_connect) { case HCI_AUTO_CONN_EXPLICIT: @@ -127,10 +127,10 @@ static void hci_connect_le_scan_cleanup(struct hci_conn *conn, u8 status) return; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: - list_add(¶ms->action, &hdev->pend_le_reports); + hci_pend_le_list_add(params, &hdev->pend_le_reports); break; default: break; @@ -1426,8 +1426,8 @@ static int hci_explicit_conn_params_set(struct hci_dev *hdev, if (params->auto_connect == HCI_AUTO_CONN_DISABLED || params->auto_connect == HCI_AUTO_CONN_REPORT || params->auto_connect == HCI_AUTO_CONN_EXPLICIT) { - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); } params->explicit_connect = true; diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 48917c68358d..b421e196f60c 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -2249,21 +2249,45 @@ struct hci_conn_params *hci_conn_params_lookup(struct hci_dev *hdev, return NULL; } -/* This function requires the caller holds hdev->lock */ +/* This function requires the caller holds hdev->lock or rcu_read_lock */ struct hci_conn_params *hci_pend_le_action_lookup(struct list_head *list, bdaddr_t *addr, u8 addr_type) { struct hci_conn_params *param; - list_for_each_entry(param, list, action) { + rcu_read_lock(); + + list_for_each_entry_rcu(param, list, action) { if (bacmp(¶m->addr, addr) == 0 && - param->addr_type == addr_type) + param->addr_type == addr_type) { + rcu_read_unlock(); return param; + } } + rcu_read_unlock(); + return NULL; } +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_list_del_init(struct hci_conn_params *param) +{ + if (list_empty(¶m->action)) + return; + + list_del_rcu(¶m->action); + synchronize_rcu(); + INIT_LIST_HEAD(¶m->action); +} + +/* This function requires the caller holds hdev->lock */ +void hci_pend_le_list_add(struct hci_conn_params *param, + struct list_head *list) +{ + list_add_rcu(¶m->action, list); +} + /* This function requires the caller holds hdev->lock */ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, bdaddr_t *addr, u8 addr_type) @@ -2297,14 +2321,15 @@ struct hci_conn_params *hci_conn_params_add(struct hci_dev *hdev, return params; } -static void hci_conn_params_free(struct hci_conn_params *params) +void hci_conn_params_free(struct hci_conn_params *params) { + hci_pend_le_list_del_init(params); + if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); } - list_del(¶ms->action); list_del(¶ms->list); kfree(params); } @@ -2342,8 +2367,7 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev) continue; } - list_del(¶ms->list); - kfree(params); + hci_conn_params_free(params); } BT_DBG("All LE disabled connection parameters were removed"); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 95816a938cea..fbc5cb8548f2 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1564,7 +1564,7 @@ static u8 hci_cc_le_set_privacy_mode(struct hci_dev *hdev, void *data, params = hci_conn_params_lookup(hdev, &cp->bdaddr, cp->bdaddr_type); if (params) - params->privacy_mode = cp->mode; + WRITE_ONCE(params->privacy_mode, cp->mode); hci_dev_unlock(hdev); @@ -2804,8 +2804,8 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; default: @@ -3423,8 +3423,8 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, void *data, case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_del_init(¶ms->action); - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_del_init(params); + hci_pend_le_list_add(params, &hdev->pend_le_conns); hci_update_passive_scan(hdev); break; @@ -5962,7 +5962,7 @@ static void le_conn_complete_evt(struct hci_dev *hdev, u8 status, params = hci_pend_le_action_lookup(&hdev->pend_le_conns, &conn->dst, conn->dst_type); if (params) { - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); if (params->conn) { hci_conn_drop(params->conn); hci_conn_put(params->conn); diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 8561616abbe5..4d1e32bb6a9c 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -2160,15 +2160,23 @@ static int hci_le_del_accept_list_sync(struct hci_dev *hdev, return 0; } +struct conn_params { + bdaddr_t addr; + u8 addr_type; + hci_conn_flags_t flags; + u8 privacy_mode; +}; + /* Adds connection to resolve list if needed. * Setting params to NULL programs local hdev->irk */ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, - struct hci_conn_params *params) + struct conn_params *params) { struct hci_cp_le_add_to_resolv_list cp; struct smp_irk *irk; struct bdaddr_list_with_irk *entry; + struct hci_conn_params *p; if (!use_ll_privacy(hdev)) return 0; @@ -2203,6 +2211,16 @@ static int hci_le_add_resolve_list_sync(struct hci_dev *hdev, /* Default privacy mode is always Network */ params->privacy_mode = HCI_NETWORK_PRIVACY; + rcu_read_lock(); + p = hci_pend_le_action_lookup(&hdev->pend_le_conns, + ¶ms->addr, params->addr_type); + if (!p) + p = hci_pend_le_action_lookup(&hdev->pend_le_reports, + ¶ms->addr, params->addr_type); + if (p) + WRITE_ONCE(p->privacy_mode, HCI_NETWORK_PRIVACY); + rcu_read_unlock(); + done: if (hci_dev_test_flag(hdev, HCI_PRIVACY)) memcpy(cp.local_irk, hdev->irk, 16); @@ -2215,7 +2233,7 @@ done: /* Set Device Privacy Mode. */ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, - struct hci_conn_params *params) + struct conn_params *params) { struct hci_cp_le_set_privacy_mode cp; struct smp_irk *irk; @@ -2240,6 +2258,8 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, bacpy(&cp.bdaddr, &irk->bdaddr); cp.mode = HCI_DEVICE_PRIVACY; + /* Note: params->privacy_mode is not updated since it is a copy */ + return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_PRIVACY_MODE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } @@ -2249,7 +2269,7 @@ static int hci_le_set_privacy_mode_sync(struct hci_dev *hdev, * properly set the privacy mode. */ static int hci_le_add_accept_list_sync(struct hci_dev *hdev, - struct hci_conn_params *params, + struct conn_params *params, u8 *num_entries) { struct hci_cp_le_add_to_accept_list cp; @@ -2447,6 +2467,52 @@ struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, return __hci_cmd_sync_sk(hdev, opcode, 0, NULL, 0, HCI_CMD_TIMEOUT, sk); } +static struct conn_params *conn_params_copy(struct list_head *list, size_t *n) +{ + struct hci_conn_params *params; + struct conn_params *p; + size_t i; + + rcu_read_lock(); + + i = 0; + list_for_each_entry_rcu(params, list, action) + ++i; + *n = i; + + rcu_read_unlock(); + + p = kvcalloc(*n, sizeof(struct conn_params), GFP_KERNEL); + if (!p) + return NULL; + + rcu_read_lock(); + + i = 0; + list_for_each_entry_rcu(params, list, action) { + /* Racing adds are handled in next scan update */ + if (i >= *n) + break; + + /* No hdev->lock, but: addr, addr_type are immutable. + * privacy_mode is only written by us or in + * hci_cc_le_set_privacy_mode that we wait for. + * We should be idempotent so MGMT updating flags + * while we are processing is OK. + */ + bacpy(&p[i].addr, ¶ms->addr); + p[i].addr_type = params->addr_type; + p[i].flags = READ_ONCE(params->flags); + p[i].privacy_mode = READ_ONCE(params->privacy_mode); + ++i; + } + + rcu_read_unlock(); + + *n = i; + return p; +} + /* Device must not be scanning when updating the accept list. * * Update is done using the following sequence: @@ -2466,11 +2532,12 @@ struct sk_buff *hci_read_local_oob_data_sync(struct hci_dev *hdev, */ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) { - struct hci_conn_params *params; + struct conn_params *params; struct bdaddr_list *b, *t; u8 num_entries = 0; bool pend_conn, pend_report; u8 filter_policy; + size_t i, n; int err; /* Pause advertising if resolving list can be used as controllers @@ -2504,6 +2571,7 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) if (hci_conn_hash_lookup_le(hdev, &b->bdaddr, b->bdaddr_type)) continue; + /* Pointers not dereferenced, no locks needed */ pend_conn = hci_pend_le_action_lookup(&hdev->pend_le_conns, &b->bdaddr, b->bdaddr_type); @@ -2532,23 +2600,50 @@ static u8 hci_update_accept_list_sync(struct hci_dev *hdev) * available accept list entries in the controller, then * just abort and return filer policy value to not use the * accept list. + * + * The list and params may be mutated while we wait for events, + * so make a copy and iterate it. */ - list_for_each_entry(params, &hdev->pend_le_conns, action) { - err = hci_le_add_accept_list_sync(hdev, params, &num_entries); - if (err) + + params = conn_params_copy(&hdev->pend_le_conns, &n); + if (!params) { + err = -ENOMEM; + goto done; + } + + for (i = 0; i < n; ++i) { + err = hci_le_add_accept_list_sync(hdev, ¶ms[i], + &num_entries); + if (err) { + kvfree(params); goto done; + } } + kvfree(params); + /* After adding all new pending connections, walk through * the list of pending reports and also add these to the * accept list if there is still space. Abort if space runs out. */ - list_for_each_entry(params, &hdev->pend_le_reports, action) { - err = hci_le_add_accept_list_sync(hdev, params, &num_entries); - if (err) + + params = conn_params_copy(&hdev->pend_le_reports, &n); + if (!params) { + err = -ENOMEM; + goto done; + } + + for (i = 0; i < n; ++i) { + err = hci_le_add_accept_list_sync(hdev, ¶ms[i], + &num_entries); + if (err) { + kvfree(params); goto done; + } } + kvfree(params); + /* Use the allowlist unless the following conditions are all true: * - We are not currently suspending * - There are 1 or more ADV monitors registered and it's not offloaded @@ -4837,12 +4932,12 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) struct hci_conn_params *p; list_for_each_entry(p, &hdev->le_conn_params, list) { + hci_pend_le_list_del_init(p); if (p->conn) { hci_conn_drop(p->conn); hci_conn_put(p->conn); p->conn = NULL; } - list_del_init(&p->action); } BT_DBG("All LE pending actions cleared"); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index f7b2d0971f24..1e07d0f28972 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -1297,15 +1297,15 @@ static void restart_le_actions(struct hci_dev *hdev) /* Needed for AUTO_OFF case where might not "really" * have been powered off. */ - list_del_init(&p->action); + hci_pend_le_list_del_init(p); switch (p->auto_connect) { case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: - list_add(&p->action, &hdev->pend_le_conns); + hci_pend_le_list_add(p, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: - list_add(&p->action, &hdev->pend_le_reports); + hci_pend_le_list_add(p, &hdev->pend_le_reports); break; default: break; @@ -5169,7 +5169,7 @@ static int set_device_flags(struct sock *sk, struct hci_dev *hdev, void *data, goto unlock; } - params->flags = current_flags; + WRITE_ONCE(params->flags, current_flags); status = MGMT_STATUS_SUCCESS; /* Update passive scan if HCI_CONN_FLAG_DEVICE_PRIVACY @@ -7580,7 +7580,7 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, if (params->auto_connect == auto_connect) return 0; - list_del_init(¶ms->action); + hci_pend_le_list_del_init(params); switch (auto_connect) { case HCI_AUTO_CONN_DISABLED: @@ -7589,18 +7589,18 @@ static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr, * connect to device, keep connecting. */ if (params->explicit_connect) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; case HCI_AUTO_CONN_REPORT: if (params->explicit_connect) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); else - list_add(¶ms->action, &hdev->pend_le_reports); + hci_pend_le_list_add(params, &hdev->pend_le_reports); break; case HCI_AUTO_CONN_DIRECT: case HCI_AUTO_CONN_ALWAYS: if (!is_connected(hdev, addr, addr_type)) - list_add(¶ms->action, &hdev->pend_le_conns); + hci_pend_le_list_add(params, &hdev->pend_le_conns); break; } @@ -7823,9 +7823,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, goto unlock; } - list_del(¶ms->action); - list_del(¶ms->list); - kfree(params); + hci_conn_params_free(params); device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type); } else { @@ -7856,9 +7854,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev, p->auto_connect = HCI_AUTO_CONN_EXPLICIT; continue; } - list_del(&p->action); - list_del(&p->list); - kfree(p); + hci_conn_params_free(p); } bt_dev_dbg(hdev, "All LE connection parameters were removed"); -- cgit v1.2.3 From 7f7cfcb6f0825652973b780f248603e23f16ee90 Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 19 Jun 2023 01:04:32 +0300 Subject: Bluetooth: hci_event: call disconnect callback before deleting conn In hci_cs_disconnect, we do hci_conn_del even if disconnection failed. ISO, L2CAP and SCO connections refer to the hci_conn without hci_conn_get, so disconn_cfm must be called so they can clean up their conn, otherwise use-after-free occurs. ISO: ========================================================== iso_sock_connect:880: sk 00000000eabd6557 iso_connect_cis:356: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7e:da ... iso_conn_add:140: hcon 000000001696f1fd conn 00000000b6251073 hci_dev_put:1487: hci0 orig refcnt 17 __iso_chan_add:214: conn 00000000b6251073 iso_sock_clear_timer:117: sock 00000000eabd6557 state 3 ... hci_rx_work:4085: hci0 Event packet hci_event_packet:7601: hci0: event 0x0f hci_cmd_status_evt:4346: hci0: opcode 0x0406 hci_cs_disconnect:2760: hci0: status 0x0c hci_sent_cmd_data:3107: hci0 opcode 0x0406 hci_conn_del:1151: hci0 hcon 000000001696f1fd handle 2560 hci_conn_unlink:1102: hci0: hcon 000000001696f1fd hci_conn_drop:1451: hcon 00000000d8521aaf orig refcnt 2 hci_chan_list_flush:2780: hcon 000000001696f1fd hci_dev_put:1487: hci0 orig refcnt 21 hci_dev_put:1487: hci0 orig refcnt 20 hci_req_cmd_complete:3978: opcode 0x0406 status 0x0c ... ... iso_sock_sendmsg:1098: sock 00000000dea5e2e0, sk 00000000eabd6557 BUG: kernel NULL pointer dereference, address: 0000000000000668 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 RIP: 0010:iso_sock_sendmsg (net/bluetooth/iso.c:1112) bluetooth ========================================================== L2CAP: ================================================================== hci_cmd_status_evt:4359: hci0: opcode 0x0406 hci_cs_disconnect:2760: hci0: status 0x0c hci_sent_cmd_data:3085: hci0 opcode 0x0406 hci_conn_del:1151: hci0 hcon ffff88800c999000 handle 3585 hci_conn_unlink:1102: hci0: hcon ffff88800c999000 hci_chan_list_flush:2780: hcon ffff88800c999000 hci_chan_del:2761: hci0 hcon ffff88800c999000 chan ffff888018ddd280 ... BUG: KASAN: slab-use-after-free in hci_send_acl+0x2d/0x540 [bluetooth] Read of size 8 at addr ffff888018ddd298 by task bluetoothd/1175 CPU: 0 PID: 1175 Comm: bluetoothd Tainted: G E 6.4.0-rc4+ #2 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 Call Trace: dump_stack_lvl+0x5b/0x90 print_report+0xcf/0x670 ? __virt_addr_valid+0xf8/0x180 ? hci_send_acl+0x2d/0x540 [bluetooth] kasan_report+0xa8/0xe0 ? hci_send_acl+0x2d/0x540 [bluetooth] hci_send_acl+0x2d/0x540 [bluetooth] ? __pfx___lock_acquire+0x10/0x10 l2cap_chan_send+0x1fd/0x1300 [bluetooth] ? l2cap_sock_sendmsg+0xf2/0x170 [bluetooth] ? __pfx_l2cap_chan_send+0x10/0x10 [bluetooth] ? lock_release+0x1d5/0x3c0 ? mark_held_locks+0x1a/0x90 l2cap_sock_sendmsg+0x100/0x170 [bluetooth] sock_write_iter+0x275/0x280 ? __pfx_sock_write_iter+0x10/0x10 ? __pfx___lock_acquire+0x10/0x10 do_iter_readv_writev+0x176/0x220 ? __pfx_do_iter_readv_writev+0x10/0x10 ? find_held_lock+0x83/0xa0 ? selinux_file_permission+0x13e/0x210 do_iter_write+0xda/0x340 vfs_writev+0x1b4/0x400 ? __pfx_vfs_writev+0x10/0x10 ? __seccomp_filter+0x112/0x750 ? populate_seccomp_data+0x182/0x220 ? __fget_light+0xdf/0x100 ? do_writev+0x19d/0x210 do_writev+0x19d/0x210 ? __pfx_do_writev+0x10/0x10 ? mark_held_locks+0x1a/0x90 do_syscall_64+0x60/0x90 ? lockdep_hardirqs_on_prepare+0x149/0x210 ? do_syscall_64+0x6c/0x90 ? lockdep_hardirqs_on_prepare+0x149/0x210 entry_SYSCALL_64_after_hwframe+0x72/0xdc RIP: 0033:0x7ff45cb23e64 Code: 15 d1 1f 0d 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b8 0f 1f 00 f3 0f 1e fa 80 3d 9d a7 0d 00 00 74 13 b8 14 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 48 83 ec 28 89 54 24 1c 48 89 RSP: 002b:00007fff21ae09b8 EFLAGS: 00000202 ORIG_RAX: 0000000000000014 RAX: ffffffffffffffda RBX: 0000000000000001 RCX: 00007ff45cb23e64 RDX: 0000000000000001 RSI: 00007fff21ae0aa0 RDI: 0000000000000017 RBP: 00007fff21ae0aa0 R08: 000000000095a8a0 R09: 0000607000053f40 R10: 0000000000000001 R11: 0000000000000202 R12: 00007fff21ae0ac0 R13: 00000fffe435c150 R14: 00007fff21ae0a80 R15: 000060f000000040 Allocated by task 771: kasan_save_stack+0x33/0x60 kasan_set_track+0x25/0x30 __kasan_kmalloc+0xaa/0xb0 hci_chan_create+0x67/0x1b0 [bluetooth] l2cap_conn_add.part.0+0x17/0x590 [bluetooth] l2cap_connect_cfm+0x266/0x6b0 [bluetooth] hci_le_remote_feat_complete_evt+0x167/0x310 [bluetooth] hci_event_packet+0x38d/0x800 [bluetooth] hci_rx_work+0x287/0xb20 [bluetooth] process_one_work+0x4f7/0x970 worker_thread+0x8f/0x620 kthread+0x17f/0x1c0 ret_from_fork+0x2c/0x50 Freed by task 771: kasan_save_stack+0x33/0x60 kasan_set_track+0x25/0x30 kasan_save_free_info+0x2e/0x50 ____kasan_slab_free+0x169/0x1c0 slab_free_freelist_hook+0x9e/0x1c0 __kmem_cache_free+0xc0/0x310 hci_chan_list_flush+0x46/0x90 [bluetooth] hci_conn_cleanup+0x7d/0x330 [bluetooth] hci_cs_disconnect+0x35d/0x530 [bluetooth] hci_cmd_status_evt+0xef/0x2b0 [bluetooth] hci_event_packet+0x38d/0x800 [bluetooth] hci_rx_work+0x287/0xb20 [bluetooth] process_one_work+0x4f7/0x970 worker_thread+0x8f/0x620 kthread+0x17f/0x1c0 ret_from_fork+0x2c/0x50 ================================================================== Fixes: b8d290525e39 ("Bluetooth: clean up connection in hci_cs_disconnect") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_event.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index fbc5cb8548f2..31ca320ce38d 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -2784,6 +2784,9 @@ static void hci_cs_disconnect(struct hci_dev *hdev, u8 status) hci_enable_advertising(hdev); } + /* Inform sockets conn is gone before we delete it */ + hci_disconn_cfm(conn, HCI_ERROR_UNSPECIFIED); + goto done; } -- cgit v1.2.3 From d40ae85ee62e3666f45bc61864b22121346f88ef Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 19 Jun 2023 01:04:33 +0300 Subject: Bluetooth: ISO: fix iso_conn related locking and validity issues sk->sk_state indicates whether iso_pi(sk)->conn is valid. Operations that check/update sk_state and access conn should hold lock_sock, otherwise they can race. The order of taking locks is hci_dev_lock > lock_sock > iso_conn_lock, which is how it is in connect/disconnect_cfm -> iso_conn_del -> iso_chan_del. Fix locking in iso_connect_cis/bis and sendmsg/recvmsg to take lock_sock around updating sk_state and conn. iso_conn_del must not occur during iso_connect_cis/bis, as it frees the iso_conn. Hold hdev->lock longer to prevent that. This should not reintroduce the issue fixed in commit 241f51931c35 ("Bluetooth: ISO: Avoid circular locking dependency"), since the we acquire locks in order. We retain the fix in iso_sock_connect to release lock_sock before iso_connect_* acquires hdev->lock. Similarly for commit 6a5ad251b7cd ("Bluetooth: ISO: Fix possible circular locking dependency"). We retain the fix in iso_conn_ready to not acquire iso_conn_lock before lock_sock. iso_conn_add shall return iso_conn with valid hcon. Make it so also when reusing an old CIS connection waiting for disconnect timeout (see __iso_sock_close where conn->hcon is set to NULL). Trace with iso_conn_del after iso_chan_add in iso_connect_cis: =============================================================== iso_sock_create:771: sock 00000000be9b69b7 iso_sock_init:693: sk 000000004dff667e iso_sock_bind:827: sk 000000004dff667e 70:1a:b8:98:ff:a2 type 1 iso_sock_setsockopt:1289: sk 000000004dff667e iso_sock_setsockopt:1289: sk 000000004dff667e iso_sock_setsockopt:1289: sk 000000004dff667e iso_sock_connect:875: sk 000000004dff667e iso_connect_cis:353: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7e:da hci_get_route:1199: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7e:da hci_conn_add:1005: hci0 dst 28:3d:c2:4a:7e:da iso_conn_add:140: hcon 000000007b65d182 conn 00000000daf8625e __iso_chan_add:214: conn 00000000daf8625e iso_connect_cfm:1700: hcon 000000007b65d182 bdaddr 28:3d:c2:4a:7e:da status 12 iso_conn_del:187: hcon 000000007b65d182 conn 00000000daf8625e, err 16 iso_sock_clear_timer:117: sock 000000004dff667e state 3 iso_chan_del:153: sk 000000004dff667e, conn 00000000daf8625e, err 16 hci_conn_del:1151: hci0 hcon 000000007b65d182 handle 65535 hci_conn_unlink:1102: hci0: hcon 000000007b65d182 hci_chan_list_flush:2780: hcon 000000007b65d182 iso_sock_getsockopt:1376: sk 000000004dff667e iso_sock_getname:1070: sock 00000000be9b69b7, sk 000000004dff667e iso_sock_getname:1070: sock 00000000be9b69b7, sk 000000004dff667e iso_sock_getsockopt:1376: sk 000000004dff667e iso_sock_getname:1070: sock 00000000be9b69b7, sk 000000004dff667e iso_sock_getname:1070: sock 00000000be9b69b7, sk 000000004dff667e iso_sock_shutdown:1434: sock 00000000be9b69b7, sk 000000004dff667e, how 1 __iso_sock_close:632: sk 000000004dff667e state 5 socket 00000000be9b69b7 BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 8000000006467067 P4D 8000000006467067 PUD 3f5f067 PMD 0 Oops: 0000 [#1] PREEMPT SMP PTI Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 RIP: 0010:__iso_sock_close (net/bluetooth/iso.c:664) bluetooth =============================================================== Trace with iso_conn_del before iso_chan_add in iso_connect_cis: =============================================================== iso_connect_cis:356: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7e:da ... iso_conn_add:140: hcon 0000000093bc551f conn 00000000768ae504 hci_dev_put:1487: hci0 orig refcnt 21 hci_event_packet:7607: hci0: event 0x0e hci_cmd_complete_evt:4231: hci0: opcode 0x2062 hci_cc_le_set_cig_params:3846: hci0: status 0x07 hci_sent_cmd_data:3107: hci0 opcode 0x2062 iso_connect_cfm:1703: hcon 0000000093bc551f bdaddr 28:3d:c2:4a:7e:da status 7 iso_conn_del:187: hcon 0000000093bc551f conn 00000000768ae504, err 12 hci_conn_del:1151: hci0 hcon 0000000093bc551f handle 65535 hci_conn_unlink:1102: hci0: hcon 0000000093bc551f hci_chan_list_flush:2780: hcon 0000000093bc551f __iso_chan_add:214: conn 00000000768ae504 iso_sock_clear_timer:117: sock 0000000098323f95 state 3 general protection fault, probably for non-canonical address 0x30b29c630930aec8: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1920 Comm: bluetoothd Tainted: G E 6.3.0-rc7+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 RIP: 0010:detach_if_pending+0x28/0xd0 Code: 90 90 0f 1f 44 00 00 48 8b 47 08 48 85 c0 0f 84 ad 00 00 00 55 89 d5 53 48 83 3f 00 48 89 fb 74 7d 66 90 48 8b 03 48 8b 53 08 <> RSP: 0018:ffffb90841a67d08 EFLAGS: 00010007 RAX: 0000000000000000 RBX: ffff9141bd5061b8 RCX: 0000000000000000 RDX: 30b29c630930aec8 RSI: ffff9141fdd21e80 RDI: ffff9141bd5061b8 RBP: 0000000000000001 R08: 0000000000000000 R09: ffffb90841a67b88 R10: 0000000000000003 R11: ffffffff8613f558 R12: ffff9141fdd21e80 R13: 0000000000000000 R14: ffff9141b5976010 R15: ffff914185755338 FS: 00007f45768bd840(0000) GS:ffff9141fdd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000619000424074 CR3: 0000000009f5e005 CR4: 0000000000170ee0 Call Trace: timer_delete+0x48/0x80 try_to_grab_pending+0xdf/0x170 __cancel_work+0x37/0xb0 iso_connect_cis+0x141/0x400 [bluetooth] =============================================================== Trace with NULL conn->hcon in state BT_CONNECT: =============================================================== __iso_sock_close:619: sk 00000000f7c71fc5 state 1 socket 00000000d90c5fe5 ... __iso_sock_close:619: sk 00000000f7c71fc5 state 8 socket 00000000d90c5fe5 iso_chan_del:153: sk 00000000f7c71fc5, conn 0000000022c03a7e, err 104 ... iso_sock_connect:862: sk 00000000129b56c3 iso_connect_cis:348: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7d:2a hci_get_route:1199: 70:1a:b8:98:ff:a2 -> 28:3d:c2:4a:7d:2a hci_dev_hold:1495: hci0 orig refcnt 19 __iso_chan_add:214: conn 0000000022c03a7e iso_sock_clear_timer:117: sock 00000000129b56c3 state 3 ... iso_sock_ready:1485: sk 00000000129b56c3 ... iso_sock_sendmsg:1077: sock 00000000e5013966, sk 00000000129b56c3 BUG: kernel NULL pointer dereference, address: 00000000000006a8 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP PTI CPU: 1 PID: 1403 Comm: wireplumber Tainted: G E 6.3.0-rc7+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.2-1.fc38 04/01/2014 RIP: 0010:iso_sock_sendmsg+0x63/0x2a0 [bluetooth] =============================================================== Fixes: 241f51931c35 ("Bluetooth: ISO: Avoid circular locking dependency") Fixes: 6a5ad251b7cd ("Bluetooth: ISO: Fix possible circular locking dependency") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/iso.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 0e6cc57b3911..505d62247268 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -123,8 +123,11 @@ static struct iso_conn *iso_conn_add(struct hci_conn *hcon) { struct iso_conn *conn = hcon->iso_data; - if (conn) + if (conn) { + if (!conn->hcon) + conn->hcon = hcon; return conn; + } conn = kzalloc(sizeof(*conn), GFP_KERNEL); if (!conn) @@ -300,14 +303,13 @@ static int iso_connect_bis(struct sock *sk) goto unlock; } - hci_dev_unlock(hdev); - hci_dev_put(hdev); + lock_sock(sk); err = iso_chan_add(conn, sk, NULL); - if (err) - return err; - - lock_sock(sk); + if (err) { + release_sock(sk); + goto unlock; + } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -321,7 +323,6 @@ static int iso_connect_bis(struct sock *sk) } release_sock(sk); - return err; unlock: hci_dev_unlock(hdev); @@ -389,14 +390,13 @@ static int iso_connect_cis(struct sock *sk) goto unlock; } - hci_dev_unlock(hdev); - hci_dev_put(hdev); + lock_sock(sk); err = iso_chan_add(conn, sk, NULL); - if (err) - return err; - - lock_sock(sk); + if (err) { + release_sock(sk); + goto unlock; + } /* Update source addr of the socket */ bacpy(&iso_pi(sk)->src, &hcon->src); @@ -413,7 +413,6 @@ static int iso_connect_cis(struct sock *sk) } release_sock(sk); - return err; unlock: hci_dev_unlock(hdev); @@ -1072,8 +1071,8 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { struct sock *sk = sock->sk; - struct iso_conn *conn = iso_pi(sk)->conn; struct sk_buff *skb, **frag; + size_t mtu; int err; BT_DBG("sock %p, sk %p", sock, sk); @@ -1085,11 +1084,18 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; - if (sk->sk_state != BT_CONNECTED) + lock_sock(sk); + + if (sk->sk_state != BT_CONNECTED) { + release_sock(sk); return -ENOTCONN; + } + + mtu = iso_pi(sk)->conn->hcon->hdev->iso_mtu; + + release_sock(sk); - skb = bt_skb_sendmsg(sk, msg, len, conn->hcon->hdev->iso_mtu, - HCI_ISO_DATA_HDR_SIZE, 0); + skb = bt_skb_sendmsg(sk, msg, len, mtu, HCI_ISO_DATA_HDR_SIZE, 0); if (IS_ERR(skb)) return PTR_ERR(skb); @@ -1102,8 +1108,7 @@ static int iso_sock_sendmsg(struct socket *sock, struct msghdr *msg, while (len) { struct sk_buff *tmp; - tmp = bt_skb_sendmsg(sk, msg, len, conn->hcon->hdev->iso_mtu, - 0, 0); + tmp = bt_skb_sendmsg(sk, msg, len, mtu, 0, 0); if (IS_ERR(tmp)) { kfree_skb(skb); return PTR_ERR(tmp); @@ -1158,15 +1163,19 @@ static int iso_sock_recvmsg(struct socket *sock, struct msghdr *msg, BT_DBG("sk %p", sk); if (test_and_clear_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags)) { + lock_sock(sk); switch (sk->sk_state) { case BT_CONNECT2: - lock_sock(sk); iso_conn_defer_accept(pi->conn->hcon); sk->sk_state = BT_CONFIG; release_sock(sk); return 0; case BT_CONNECT: + release_sock(sk); return iso_connect_cis(sk); + default: + release_sock(sk); + break; } } -- cgit v1.2.3 From 6910e2eb39254d279bce5bc0f8eb6af45b59357c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 3 Jul 2023 13:30:48 +0200 Subject: Bluetooth: coredump: fix building with coredump disabled The btmtk driver uses an IS_ENABLED() check to conditionally compile the coredump support, but this fails to build because the hdev->dump member is in an #ifdef: drivers/bluetooth/btmtk.c: In function 'btmtk_process_coredump': drivers/bluetooth/btmtk.c:386:30: error: 'struct hci_dev' has no member named 'dump' 386 | schedule_delayed_work(&hdev->dump.dump_timeout, | ^~ The struct member doesn't really make a huge difference in the total size, so just remove the #ifdef around it to avoid adding similar checks around each user. Fixes: 872f8c253cb9e ("Bluetooth: btusb: mediatek: add MediaTek devcoredump support") Fixes: 9695ef876fd12 ("Bluetooth: Add support for hci devcoredump") Signed-off-by: Arnd Bergmann Signed-off-by: Luiz Augusto von Dentz --- include/net/bluetooth/hci_core.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 870b6d3c5146..e01d52cb668c 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -593,9 +593,7 @@ struct hci_dev { const char *fw_info; struct dentry *debugfs; -#ifdef CONFIG_DEV_COREDUMP struct hci_devcoredump dump; -#endif struct device dev; -- cgit v1.2.3 From de6dfcefd107667ce2dbedf4d9337f5ed557a4a1 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Fri, 30 Jun 2023 15:33:14 -0700 Subject: Bluetooth: hci_sync: Avoid use-after-free in dbg for hci_remove_adv_monitor() KASAN reports that there's a use-after-free in hci_remove_adv_monitor(). Trawling through the disassembly, you can see that the complaint is from the access in bt_dev_dbg() under the HCI_ADV_MONITOR_EXT_MSFT case. The problem case happens because msft_remove_monitor() can end up freeing the monitor structure. Specifically: hci_remove_adv_monitor() -> msft_remove_monitor() -> msft_remove_monitor_sync() -> msft_le_cancel_monitor_advertisement_cb() -> hci_free_adv_monitor() Let's fix the problem by just stashing the relevant data when it's still valid. Fixes: 7cf5c2978f23 ("Bluetooth: hci_sync: Refactor remove Adv Monitor") Signed-off-by: Douglas Anderson Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b421e196f60c..1ec83985f1ab 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -1972,6 +1972,7 @@ static int hci_remove_adv_monitor(struct hci_dev *hdev, struct adv_monitor *monitor) { int status = 0; + int handle; switch (hci_get_adv_monitor_offload_ext(hdev)) { case HCI_ADV_MONITOR_EXT_NONE: /* also goes here when powered off */ @@ -1980,9 +1981,10 @@ static int hci_remove_adv_monitor(struct hci_dev *hdev, goto free_monitor; case HCI_ADV_MONITOR_EXT_MSFT: + handle = monitor->handle; status = msft_remove_monitor(hdev, monitor); bt_dev_dbg(hdev, "%s remove monitor %d msft status %d", - hdev->name, monitor->handle, status); + hdev->name, handle, status); break; } -- cgit v1.2.3 From b4066eb04bb67e7ff66e5aaab0db4a753f37eaad Mon Sep 17 00:00:00 2001 From: Siddh Raman Pant Date: Tue, 11 Jul 2023 18:43:53 +0530 Subject: Bluetooth: hci_conn: return ERR_PTR instead of NULL when there is no link hci_connect_sco currently returns NULL when there is no link (i.e. when hci_conn_link() returns NULL). sco_connect() expects an ERR_PTR in case of any error (see line 266 in sco.c). Thus, hcon set as NULL passes through to sco_conn_add(), which tries to get hcon->hdev, resulting in dereferencing a NULL pointer as reported by syzkaller. The same issue exists for iso_connect_cis() calling hci_connect_cis(). Thus, make hci_connect_sco() and hci_connect_cis() return ERR_PTR instead of NULL. Reported-and-tested-by: syzbot+37acd5d80d00d609d233@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=37acd5d80d00d609d233 Fixes: 06149746e720 ("Bluetooth: hci_conn: Add support for linking multiple hcon") Signed-off-by: Siddh Raman Pant Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/hci_conn.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index db598942cab4..76222565e2df 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -1684,7 +1684,7 @@ struct hci_conn *hci_connect_sco(struct hci_dev *hdev, int type, bdaddr_t *dst, if (!link) { hci_conn_drop(acl); hci_conn_drop(sco); - return NULL; + return ERR_PTR(-ENOLINK); } sco->setting = setting; @@ -2254,7 +2254,7 @@ struct hci_conn *hci_connect_cis(struct hci_dev *hdev, bdaddr_t *dst, if (!link) { hci_conn_drop(le); hci_conn_drop(cis); - return NULL; + return ERR_PTR(-ENOLINK); } /* If LE is already connected and CIS handle is already set proceed to -- cgit v1.2.3 From 3dcaa192ac2159193bc6ab57bc5369dcb84edd8e Mon Sep 17 00:00:00 2001 From: Pauli Virtanen Date: Mon, 10 Jul 2023 19:48:19 +0300 Subject: Bluetooth: SCO: fix sco_conn related locking and validity issues Operations that check/update sk_state and access conn should hold lock_sock, otherwise they can race. The order of taking locks is hci_dev_lock > lock_sock > sco_conn_lock, which is how it is in connect/disconnect_cfm -> sco_conn_del -> sco_chan_del. Fix locking in sco_connect to take lock_sock around updating sk_state and conn. sco_conn_del must not occur during sco_connect, as it frees the sco_conn. Hold hdev->lock longer to prevent that. sco_conn_add shall return sco_conn with valid hcon. Make it so also when reusing an old SCO connection waiting for disconnect timeout (see __sco_sock_close where conn->hcon is set to NULL). This should not reintroduce the issue fixed in the earlier commit 9a8ec9e8ebb5 ("Bluetooth: SCO: Fix possible circular locking dependency on sco_connect_cfm"), the relevant fix of releasing lock_sock in sco_sock_connect before acquiring hdev->lock is retained. These changes mirror similar fixes earlier in ISO sockets. Fixes: 9a8ec9e8ebb5 ("Bluetooth: SCO: Fix possible circular locking dependency on sco_connect_cfm") Signed-off-by: Pauli Virtanen Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/sco.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index cd1a27ac555d..7762604ddfc0 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -126,8 +126,11 @@ static struct sco_conn *sco_conn_add(struct hci_conn *hcon) struct hci_dev *hdev = hcon->hdev; struct sco_conn *conn = hcon->sco_data; - if (conn) + if (conn) { + if (!conn->hcon) + conn->hcon = hcon; return conn; + } conn = kzalloc(sizeof(struct sco_conn), GFP_KERNEL); if (!conn) @@ -268,21 +271,21 @@ static int sco_connect(struct sock *sk) goto unlock; } - hci_dev_unlock(hdev); - hci_dev_put(hdev); - conn = sco_conn_add(hcon); if (!conn) { hci_conn_drop(hcon); - return -ENOMEM; + err = -ENOMEM; + goto unlock; } - err = sco_chan_add(conn, sk, NULL); - if (err) - return err; - lock_sock(sk); + err = sco_chan_add(conn, sk, NULL); + if (err) { + release_sock(sk); + goto unlock; + } + /* Update source addr of the socket */ bacpy(&sco_pi(sk)->src, &hcon->src); @@ -296,8 +299,6 @@ static int sco_connect(struct sock *sk) release_sock(sk); - return err; - unlock: hci_dev_unlock(hdev); hci_dev_put(hdev); -- cgit v1.2.3 From 95b7015433053cd5f648ad2a7b8f43b2c99c949a Mon Sep 17 00:00:00 2001 From: Tomasz Moń Date: Thu, 13 Jul 2023 12:25:14 +0200 Subject: Bluetooth: btusb: Fix bluetooth on Intel Macbook 2014 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit c13380a55522 ("Bluetooth: btusb: Do not require hardcoded interface numbers") inadvertedly broke bluetooth on Intel Macbook 2014. The intention was to keep behavior intact when BTUSB_IFNUM_2 is set and otherwise allow any interface numbers. The problem is that the new logic condition omits the case where bInterfaceNumber is 0. Fix BTUSB_IFNUM_2 handling by allowing both interface number 0 and 2 when the flag is set. Fixes: c13380a55522 ("Bluetooth: btusb: Do not require hardcoded interface numbers") Reported-by: John Holland Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217651 Signed-off-by: Tomasz Moń Tested-by: John Holland Signed-off-by: Luiz Augusto von Dentz --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 5ec4ad0a5c86..764d176e9735 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -4104,6 +4104,7 @@ static int btusb_probe(struct usb_interface *intf, BT_DBG("intf %p id %p", intf, id); if ((id->driver_info & BTUSB_IFNUM_2) && + (intf->cur_altsetting->desc.bInterfaceNumber != 0) && (intf->cur_altsetting->desc.bInterfaceNumber != 2)) return -ENODEV; -- cgit v1.2.3 From d1f0a9816f5fbc1316355ec1aa4ddfb9b624cca5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 17 Jul 2023 12:32:14 +0300 Subject: Bluetooth: MGMT: Use correct address for memcpy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In function ‘fortify_memcpy_chk’, inlined from ‘get_conn_info_complete’ at net/bluetooth/mgmt.c:7281:2: include/linux/fortify-string.h:592:25: error: call to ‘__read_overflow2_field’ declared with attribute warning: detected read beyond size of field (2nd parameter); maybe use struct_group()? [-Werror=attribute-warning] 592 | __read_overflow2_field(q_size_field, size); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ cc1: all warnings being treated as errors This is due to the wrong member is used for memcpy(). Use correct one. Signed-off-by: Andy Shevchenko Signed-off-by: Luiz Augusto von Dentz --- net/bluetooth/mgmt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 1e07d0f28972..d4498037fadc 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -7285,7 +7285,7 @@ static void get_conn_info_complete(struct hci_dev *hdev, void *data, int err) bt_dev_dbg(hdev, "err %d", err); - memcpy(&rp.addr, &cp->addr.bdaddr, sizeof(rp.addr)); + memcpy(&rp.addr, &cp->addr, sizeof(rp.addr)); status = mgmt_status(err); if (status == MGMT_STATUS_SUCCESS) { -- cgit v1.2.3 From 348b81b68b13ebd489a3e6a46aa1c384c731c919 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:47 +0000 Subject: tcp: annotate data-races around tp->tcp_tx_delay do_tcp_getsockopt() reads tp->tcp_tx_delay while another cpu might change its value. Fixes: a842fe1425cb ("tcp: add optional per socket transmit delay") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index e03e08745308..bd6400e1ae9f 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3674,7 +3674,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_TX_DELAY: if (val) tcp_enable_tx_delay(); - tp->tcp_tx_delay = val; + WRITE_ONCE(tp->tcp_tx_delay, val); break; default: err = -ENOPROTOOPT; @@ -4154,7 +4154,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TX_DELAY: - val = tp->tcp_tx_delay; + val = READ_ONCE(tp->tcp_tx_delay); break; case TCP_TIMESTAMP: -- cgit v1.2.3 From dd23c9f1e8d5c1d2e3d29393412385ccb9c7a948 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:48 +0000 Subject: tcp: annotate data-races around tp->tsoffset do_tcp_getsockopt() reads tp->tsoffset while another cpu might change its value. Fixes: 93be6ce0e91b ("tcp: set and get per-socket timestamp") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 4 ++-- net/ipv4/tcp_ipv4.c | 5 +++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd6400e1ae9f..03ae6554c78d 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3656,7 +3656,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (!tp->repair) err = -EPERM; else - tp->tsoffset = val - tcp_time_stamp_raw(); + WRITE_ONCE(tp->tsoffset, val - tcp_time_stamp_raw()); break; case TCP_REPAIR_WINDOW: err = tcp_repair_set_window(tp, optval, optlen); @@ -4158,7 +4158,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_TIMESTAMP: - val = tcp_time_stamp_raw() + tp->tsoffset; + val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: val = tp->notsent_lowat; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index b5c81cf5b86f..069642014636 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -307,8 +307,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, usin->sin_port)); - tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr, - inet->inet_daddr); + WRITE_ONCE(tp->tsoffset, + secure_tcp_ts_off(net, inet->inet_saddr, + inet->inet_daddr)); } inet->inet_id = get_random_u16(); -- cgit v1.2.3 From 4164245c76ff906c9086758e1c3f87082a7f5ef5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:49 +0000 Subject: tcp: annotate data-races around tp->keepalive_time do_tcp_getsockopt() reads tp->keepalive_time while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 7 +++++-- net/ipv4/tcp.c | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 226bce6d1e8c..9e1c3337cdfe 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1517,9 +1517,12 @@ static inline int keepalive_intvl_when(const struct tcp_sock *tp) static inline int keepalive_time_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; - return tp->keepalive_time ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); + /* Paired with WRITE_ONCE() in tcp_sock_set_keepidle_locked() */ + val = READ_ONCE(tp->keepalive_time); + + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); } static inline int keepalive_probes(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 03ae6554c78d..b4f7856dfb16 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3312,7 +3312,8 @@ int tcp_sock_set_keepidle_locked(struct sock *sk, int val) if (val < 1 || val > MAX_TCP_KEEPIDLE) return -EINVAL; - tp->keepalive_time = val * HZ; + /* Paired with WRITE_ONCE() in keepalive_time_when() */ + WRITE_ONCE(tp->keepalive_time, val * HZ); if (sock_flag(sk, SOCK_KEEPOPEN) && !((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) { u32 elapsed = keepalive_time_elapsed(tp); -- cgit v1.2.3 From 5ecf9d4f52ff2f1d4d44c9b68bc75688e82f13b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:50 +0000 Subject: tcp: annotate data-races around tp->keepalive_intvl do_tcp_getsockopt() reads tp->keepalive_intvl while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp.c | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 9e1c3337cdfe..2d8d58dec652 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1509,9 +1509,14 @@ void tcp_leave_memory_pressure(struct sock *sk); static inline int keepalive_intvl_when(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; + + /* Paired with WRITE_ONCE() in tcp_sock_set_keepintvl() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_intvl); - return tp->keepalive_intvl ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_intvl); } static inline int keepalive_time_when(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index b4f7856dfb16..d55fe014e7c9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3345,7 +3345,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val) return -EINVAL; lock_sock(sk); - tcp_sk(sk)->keepalive_intvl = val * HZ; + WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ); release_sock(sk); return 0; } @@ -3559,7 +3559,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 1 || val > MAX_TCP_KEEPINTVL) err = -EINVAL; else - tp->keepalive_intvl = val * HZ; + WRITE_ONCE(tp->keepalive_intvl, val * HZ); break; case TCP_KEEPCNT: if (val < 1 || val > MAX_TCP_KEEPCNT) -- cgit v1.2.3 From 6e5e1de616bf5f3df1769abc9292191dfad9110a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:51 +0000 Subject: tcp: annotate data-races around tp->keepalive_probes do_tcp_getsockopt() reads tp->keepalive_probes while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-6-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 9 +++++++-- net/ipv4/tcp.c | 5 +++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 2d8d58dec652..a8b93b2875ea 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -1533,9 +1533,14 @@ static inline int keepalive_time_when(const struct tcp_sock *tp) static inline int keepalive_probes(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); + int val; + + /* Paired with WRITE_ONCE() in tcp_sock_set_keepcnt() + * and do_tcp_setsockopt(). + */ + val = READ_ONCE(tp->keepalive_probes); - return tp->keepalive_probes ? : - READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); + return val ? : READ_ONCE(net->ipv4.sysctl_tcp_keepalive_probes); } static inline u32 keepalive_time_elapsed(const struct tcp_sock *tp) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index d55fe014e7c9..574fd0da1673 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3357,7 +3357,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val) return -EINVAL; lock_sock(sk); - tcp_sk(sk)->keepalive_probes = val; + /* Paired with READ_ONCE() in keepalive_probes() */ + WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val); release_sock(sk); return 0; } @@ -3565,7 +3566,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 1 || val > MAX_TCP_KEEPCNT) err = -EINVAL; else - tp->keepalive_probes = val; + WRITE_ONCE(tp->keepalive_probes, val); break; case TCP_SYNCNT: if (val < 1 || val > MAX_TCP_SYNCNT) -- cgit v1.2.3 From 3a037f0f3c4bfe44518f2fbb478aa2f99a9cd8bb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:52 +0000 Subject: tcp: annotate data-races around icsk->icsk_syn_retries do_tcp_getsockopt() and reqsk_timer_handler() read icsk->icsk_syn_retries while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-7-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/tcp.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 0cc19cfbb673..aeebe8816689 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -1019,7 +1019,7 @@ static void reqsk_timer_handler(struct timer_list *t) icsk = inet_csk(sk_listener); net = sock_net(sk_listener); - max_syn_ack_retries = icsk->icsk_syn_retries ? : + max_syn_ack_retries = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_synack_retries); /* Normally all the openreqs are young and become mature * (i.e. converted to established socket) for first timeout. diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 574fd0da1673..9f74ac16f1c1 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3291,7 +3291,7 @@ int tcp_sock_set_syncnt(struct sock *sk, int val) return -EINVAL; lock_sock(sk); - inet_csk(sk)->icsk_syn_retries = val; + WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val); release_sock(sk); return 0; } @@ -3572,7 +3572,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 1 || val > MAX_TCP_SYNCNT) err = -EINVAL; else - icsk->icsk_syn_retries = val; + WRITE_ONCE(icsk->icsk_syn_retries, val); break; case TCP_SAVE_SYN: @@ -3993,7 +3993,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, val = keepalive_probes(tp); break; case TCP_SYNCNT: - val = icsk->icsk_syn_retries ? : + val = READ_ONCE(icsk->icsk_syn_retries) ? : READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: -- cgit v1.2.3 From 9df5335ca974e688389c875546e5819778a80d59 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:53 +0000 Subject: tcp: annotate data-races around tp->linger2 do_tcp_getsockopt() reads tp->linger2 while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-8-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9f74ac16f1c1..2cf129a0c00b 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3585,11 +3585,11 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_LINGER2: if (val < 0) - tp->linger2 = -1; + WRITE_ONCE(tp->linger2, -1); else if (val > TCP_FIN_TIMEOUT_MAX / HZ) - tp->linger2 = TCP_FIN_TIMEOUT_MAX; + WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX); else - tp->linger2 = val * HZ; + WRITE_ONCE(tp->linger2, val * HZ); break; case TCP_DEFER_ACCEPT: @@ -3997,7 +3997,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, READ_ONCE(net->ipv4.sysctl_tcp_syn_retries); break; case TCP_LINGER2: - val = tp->linger2; + val = READ_ONCE(tp->linger2); if (val >= 0) val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; -- cgit v1.2.3 From ae488c74422fb1dcd807c0201804b3b5e8a322a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:54 +0000 Subject: tcp: annotate data-races around rskq_defer_accept do_tcp_getsockopt() reads rskq_defer_accept while another cpu might change its value. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-9-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2cf129a0c00b..5beec71a5c41 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3594,9 +3594,9 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, case TCP_DEFER_ACCEPT: /* Translate value in seconds to number of retransmits */ - icsk->icsk_accept_queue.rskq_defer_accept = - secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, - TCP_RTO_MAX / HZ); + WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept, + secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ)); break; case TCP_WINDOW_CLAMP: @@ -4002,8 +4002,9 @@ int do_tcp_getsockopt(struct sock *sk, int level, val = (val ? : READ_ONCE(net->ipv4.sysctl_tcp_fin_timeout)) / HZ; break; case TCP_DEFER_ACCEPT: - val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept, - TCP_TIMEOUT_INIT / HZ, TCP_RTO_MAX / HZ); + val = READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept); + val = retrans_to_secs(val, TCP_TIMEOUT_INIT / HZ, + TCP_RTO_MAX / HZ); break; case TCP_WINDOW_CLAMP: val = tp->window_clamp; -- cgit v1.2.3 From 1aeb87bc1440c5447a7fa2d6e3c2cca52cbd206b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:55 +0000 Subject: tcp: annotate data-races around tp->notsent_lowat tp->notsent_lowat can be read locklessly from do_tcp_getsockopt() and tcp_poll(). Fixes: c9bee3b7fdec ("tcp: TCP_NOTSENT_LOWAT socket option") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-10-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/tcp.h | 6 +++++- net/ipv4/tcp.c | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index a8b93b2875ea..0ca972ebd3dd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -2061,7 +2061,11 @@ void __tcp_v4_send_check(struct sk_buff *skb, __be32 saddr, __be32 daddr); static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) { struct net *net = sock_net((struct sock *)tp); - return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); + u32 val; + + val = READ_ONCE(tp->notsent_lowat); + + return val ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); } bool tcp_stream_memory_free(const struct sock *sk, int wake); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 5beec71a5c41..2b2241e9b492 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3664,7 +3664,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, err = tcp_repair_set_window(tp, optval, optlen); break; case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; + WRITE_ONCE(tp->notsent_lowat, val); sk->sk_write_space(sk); break; case TCP_INQ: @@ -4164,7 +4164,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, val = tcp_time_stamp_raw() + READ_ONCE(tp->tsoffset); break; case TCP_NOTSENT_LOWAT: - val = tp->notsent_lowat; + val = READ_ONCE(tp->notsent_lowat); break; case TCP_INQ: val = tp->recvmsg_inq; -- cgit v1.2.3 From 26023e91e12c68669db416b97234328a03d8e499 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:56 +0000 Subject: tcp: annotate data-races around icsk->icsk_user_timeout This field can be read locklessly from do_tcp_getsockopt() Fixes: dca43c75e7e5 ("tcp: Add TCP_USER_TIMEOUT socket option.") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-11-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/tcp.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 2b2241e9b492..3e137e9a18f5 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -3300,7 +3300,7 @@ EXPORT_SYMBOL(tcp_sock_set_syncnt); void tcp_sock_set_user_timeout(struct sock *sk, u32 val) { lock_sock(sk); - inet_csk(sk)->icsk_user_timeout = val; + WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val); release_sock(sk); } EXPORT_SYMBOL(tcp_sock_set_user_timeout); @@ -3620,7 +3620,7 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (val < 0) err = -EINVAL; else - icsk->icsk_user_timeout = val; + WRITE_ONCE(icsk->icsk_user_timeout, val); break; case TCP_FASTOPEN: @@ -4141,7 +4141,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_USER_TIMEOUT: - val = icsk->icsk_user_timeout; + val = READ_ONCE(icsk->icsk_user_timeout); break; case TCP_FASTOPEN: -- cgit v1.2.3 From 70f360dd7042cb843635ece9d28335a4addff9eb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 19 Jul 2023 21:28:57 +0000 Subject: tcp: annotate data-races around fastopenq.max_qlen This field can be read locklessly. Fixes: 1536e2857bd3 ("tcp: Add a TCP_FASTOPEN socket option to get a max backlog on its listner") Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20230719212857.3943972-12-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/linux/tcp.h | 2 +- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_fastopen.c | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index b4c08ac86983..91a37c99ba66 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -513,7 +513,7 @@ static inline void fastopen_queue_tune(struct sock *sk, int backlog) struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue; int somaxconn = READ_ONCE(sock_net(sk)->core.sysctl_somaxconn); - queue->fastopenq.max_qlen = min_t(unsigned int, backlog, somaxconn); + WRITE_ONCE(queue->fastopenq.max_qlen, min_t(unsigned int, backlog, somaxconn)); } static inline void tcp_move_syn(struct tcp_sock *tp, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3e137e9a18f5..8ed52e1e3c99 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -4145,7 +4145,7 @@ int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_FASTOPEN: - val = icsk->icsk_accept_queue.fastopenq.max_qlen; + val = READ_ONCE(icsk->icsk_accept_queue.fastopenq.max_qlen); break; case TCP_FASTOPEN_CONNECT: diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c index 45cc7f1ca296..85e4953f1182 100644 --- a/net/ipv4/tcp_fastopen.c +++ b/net/ipv4/tcp_fastopen.c @@ -296,6 +296,7 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, static bool tcp_fastopen_queue_check(struct sock *sk) { struct fastopen_queue *fastopenq; + int max_qlen; /* Make sure the listener has enabled fastopen, and we don't * exceed the max # of pending TFO requests allowed before trying @@ -308,10 +309,11 @@ static bool tcp_fastopen_queue_check(struct sock *sk) * temporarily vs a server not supporting Fast Open at all. */ fastopenq = &inet_csk(sk)->icsk_accept_queue.fastopenq; - if (fastopenq->max_qlen == 0) + max_qlen = READ_ONCE(fastopenq->max_qlen); + if (max_qlen == 0) return false; - if (fastopenq->qlen >= fastopenq->max_qlen) { + if (fastopenq->qlen >= max_qlen) { struct request_sock *req1; spin_lock(&fastopenq->lock); req1 = fastopenq->rskq_rst_head; -- cgit v1.2.3 From 1c613beaf877c0c0d755853dc62687e2013e55c4 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 20 Jul 2023 03:02:31 +0300 Subject: net: phy: prevent stale pointer dereference in phy_init() mdio_bus_init() and phy_driver_register() both have error paths, and if those are ever hit, ethtool will have a stale pointer to the phy_ethtool_phy_ops stub structure, which references memory from a module that failed to load (phylib). It is probably hard to force an error in this code path even manually, but the error teardown path of phy_init() should be the same as phy_exit(), which is now simply not the case. Fixes: 55d8f053ce1b ("net: phy: Register ethtool PHY operations") Link: https://lore.kernel.org/netdev/ZLaiJ4G6TaJYGJyU@shell.armlinux.org.uk/ Suggested-by: Russell King (Oracle) Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20230720000231.1939689-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_device.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index 0c2014accba7..61921d4dbb13 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -3451,23 +3451,30 @@ static int __init phy_init(void) { int rc; + ethtool_set_ethtool_phy_ops(&phy_ethtool_phy_ops); + rc = mdio_bus_init(); if (rc) - return rc; + goto err_ethtool_phy_ops; - ethtool_set_ethtool_phy_ops(&phy_ethtool_phy_ops); features_init(); rc = phy_driver_register(&genphy_c45_driver, THIS_MODULE); if (rc) - goto err_c45; + goto err_mdio_bus; rc = phy_driver_register(&genphy_driver, THIS_MODULE); - if (rc) { - phy_driver_unregister(&genphy_c45_driver); + if (rc) + goto err_c45; + + return 0; + err_c45: - mdio_bus_exit(); - } + phy_driver_unregister(&genphy_c45_driver); +err_mdio_bus: + mdio_bus_exit(); +err_ethtool_phy_ops: + ethtool_set_ethtool_phy_ops(NULL); return rc; } -- cgit v1.2.3