From aed06d36ba4e7fe90f2d4a85835cee7c80ea72a7 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Sat, 22 Feb 2025 01:35:30 +0000 Subject: ceph: Remove osd_client deadcode osd_req_op_extent_osd_data_pagelist() was added in 2013 as part of commit a4ce40a9a7c1 ("libceph: combine initializing and setting osd data") but never used. The last use of osd_req_op_cls_request_data_pagelist() was removed in 2017's commit ecd4a68a26a2 ("rbd: switch rbd_obj_method_sync() to ceph_osdc_call()") Remove them. Signed-off-by: Dr. David Alan Gilbert Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- include/linux/ceph/osd_client.h | 6 ------ net/ceph/osd_client.c | 23 ----------------------- 2 files changed, 29 deletions(-) diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index d55b30057a45..50b14a5661c7 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -490,9 +490,6 @@ extern void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages); -extern void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *, - unsigned int which, - struct ceph_pagelist *pagelist); #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, @@ -509,9 +506,6 @@ void osd_req_op_extent_osd_data_bvec_pos(struct ceph_osd_request *osd_req, void osd_req_op_extent_osd_iter(struct ceph_osd_request *osd_req, unsigned int which, struct iov_iter *iter); -extern void osd_req_op_cls_request_data_pagelist(struct ceph_osd_request *, - unsigned int which, - struct ceph_pagelist *pagelist); extern void osd_req_op_cls_request_data_pages(struct ceph_osd_request *, unsigned int which, struct page **pages, u64 length, diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index b24afec24138..6664ea73ccf8 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -220,16 +220,6 @@ void osd_req_op_extent_osd_data_pages(struct ceph_osd_request *osd_req, } EXPORT_SYMBOL(osd_req_op_extent_osd_data_pages); -void osd_req_op_extent_osd_data_pagelist(struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) -{ - struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_data(osd_req, which, extent, osd_data); - ceph_osd_data_pagelist_init(osd_data, pagelist); -} -EXPORT_SYMBOL(osd_req_op_extent_osd_data_pagelist); - #ifdef CONFIG_BLOCK void osd_req_op_extent_osd_data_bio(struct ceph_osd_request *osd_req, unsigned int which, @@ -297,19 +287,6 @@ static void osd_req_op_cls_request_info_pagelist( ceph_osd_data_pagelist_init(osd_data, pagelist); } -void osd_req_op_cls_request_data_pagelist( - struct ceph_osd_request *osd_req, - unsigned int which, struct ceph_pagelist *pagelist) -{ - struct ceph_osd_data *osd_data; - - osd_data = osd_req_op_data(osd_req, which, cls, request_data); - ceph_osd_data_pagelist_init(osd_data, pagelist); - osd_req->r_ops[which].cls.indata_len += pagelist->length; - osd_req->r_ops[which].indata_len += pagelist->length; -} -EXPORT_SYMBOL(osd_req_op_cls_request_data_pagelist); - void osd_req_op_cls_request_data_pages(struct ceph_osd_request *osd_req, unsigned int which, struct page **pages, u64 length, u32 alignment, bool pages_from_pool, bool own_pages) -- cgit v1.2.3 From f452a2204614fc10e2c3b85904c4bd300c2789dc Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 12 Mar 2025 10:47:11 +0000 Subject: ceph: Fix incorrect flush end position calculation In ceph, in 
fill_fscrypt_truncate(), the end flush position is calculated by: loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; but that's using the block shift not the block size. Fix this to use the block size instead. Fixes: 5c64737d2536 ("ceph: add truncate size handling support for fscrypt") Signed-off-by: David Howells Reviewed-by: Viacheslav Dubeyko Signed-off-by: Ilya Dryomov --- fs/ceph/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7dd6c2275085..e3ab07797c85 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -2362,7 +2362,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* Try to writeback the dirty pagecaches */ if (issued & (CEPH_CAP_FILE_BUFFER)) { - loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; ret = filemap_write_and_wait_range(inode->i_mapping, orig_pos, lend); -- cgit v1.2.3 From 14c8a418159e541d70dbf8fc71225d1623beaf0f Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 20 Mar 2025 15:55:57 +0000 Subject: cpufreq: sun50i: prevent out-of-bounds access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A KASAN enabled kernel reports an out-of-bounds access when handling the nvmem cell in the sun50i cpufreq driver: ================================================================== BUG: KASAN: slab-out-of-bounds in sun50i_cpufreq_nvmem_probe+0x180/0x3d4 Read of size 4 at addr ffff000006bf31e0 by task kworker/u16:1/38 This is because the DT specifies the nvmem cell as covering only two bytes, but we use a u32 pointer to read the value. DTs for other SoCs indeed specify 4 bytes, so we cannot just shorten the variable to a u16. Fortunately nvmem_cell_read() allows to return the length of the nvmem cell, in bytes, so we can use that information to only access the valid portion of the data. To cover multiple cell sizes, use memcpy() to copy the information into a zeroed u32 buffer, then also make sure we always read the data in little endian fashion, as this is how the data is stored in the SID efuses. 
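For illustration, the read pattern the fix adopts can be sketched roughly as follows (a minimal sketch assuming the usual nvmem consumer API; the helper name is hypothetical and error handling is trimmed):

```
#include <linux/err.h>
#include <linux/kernel.h>
#include <linux/nvmem-consumer.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/types.h>

/* Hypothetical helper: read a 2- or 4-byte little-endian cell into a u32. */
static int read_speedbin_sketch(struct nvmem_cell *cell, u32 *out)
{
	void *buf;
	size_t len;
	u32 val = 0;

	buf = nvmem_cell_read(cell, &len);	/* len is set to the cell size in bytes */
	if (IS_ERR(buf))
		return PTR_ERR(buf);

	if (len <= sizeof(val))
		memcpy(&val, buf, len);		/* copy only what the cell provides */
	kfree(buf);

	*out = le32_to_cpu(val);		/* SID efuses store the value little endian */
	return 0;
}
```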
Fixes: 6cc4bcceff9a ("cpufreq: sun50i: Refactor speed bin decoding") Reported-by: Jernej Skrabec Signed-off-by: Andre Przywara Reviewed-by: Jernej Škrabec Signed-off-by: Viresh Kumar --- drivers/cpufreq/sun50i-cpufreq-nvmem.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/sun50i-cpufreq-nvmem.c b/drivers/cpufreq/sun50i-cpufreq-nvmem.c index 47d6840b3489..744312a44279 100644 --- a/drivers/cpufreq/sun50i-cpufreq-nvmem.c +++ b/drivers/cpufreq/sun50i-cpufreq-nvmem.c @@ -194,7 +194,9 @@ static int sun50i_cpufreq_get_efuse(void) struct nvmem_cell *speedbin_nvmem; const struct of_device_id *match; struct device *cpu_dev; - u32 *speedbin; + void *speedbin_ptr; + u32 speedbin = 0; + size_t len; int ret; cpu_dev = get_cpu_device(0); @@ -217,14 +219,18 @@ static int sun50i_cpufreq_get_efuse(void) return dev_err_probe(cpu_dev, PTR_ERR(speedbin_nvmem), "Could not get nvmem cell\n"); - speedbin = nvmem_cell_read(speedbin_nvmem, NULL); + speedbin_ptr = nvmem_cell_read(speedbin_nvmem, &len); nvmem_cell_put(speedbin_nvmem); - if (IS_ERR(speedbin)) - return PTR_ERR(speedbin); + if (IS_ERR(speedbin_ptr)) + return PTR_ERR(speedbin_ptr); - ret = opp_data->efuse_xlate(*speedbin); + if (len <= 4) + memcpy(&speedbin, speedbin_ptr, len); + speedbin = le32_to_cpu(speedbin); - kfree(speedbin); + ret = opp_data->efuse_xlate(speedbin); + + kfree(speedbin_ptr); return ret; }; -- cgit v1.2.3 From fc5414a4774e14e51a93499a6adfdc45f2de82e0 Mon Sep 17 00:00:00 2001 From: Pengyu Luo Date: Sat, 5 Apr 2025 00:42:19 +0800 Subject: cpufreq: Add SM8650 to cpufreq-dt-platdev blocklist SM8650 have already been supported by qcom-cpufreq-hw driver, but never been added to cpufreq-dt-platdev. This makes noise [ 0.388525] cpufreq-dt cpufreq-dt: failed register driver: -17 [ 0.388537] cpufreq-dt cpufreq-dt: probe with driver cpufreq-dt failed with error -17 So adding it to the cpufreq-dt-platdev driver's blocklist to fix it. Signed-off-by: Pengyu Luo Signed-off-by: Viresh Kumar --- drivers/cpufreq/cpufreq-dt-platdev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/cpufreq/cpufreq-dt-platdev.c b/drivers/cpufreq/cpufreq-dt-platdev.c index 2aa00769cf09..a010da0f6337 100644 --- a/drivers/cpufreq/cpufreq-dt-platdev.c +++ b/drivers/cpufreq/cpufreq-dt-platdev.c @@ -175,6 +175,7 @@ static const struct of_device_id blocklist[] __initconst = { { .compatible = "qcom,sm8350", }, { .compatible = "qcom,sm8450", }, { .compatible = "qcom,sm8550", }, + { .compatible = "qcom,sm8650", }, { .compatible = "st,stih407", }, { .compatible = "st,stih410", }, -- cgit v1.2.3 From d4f610a9bafdec8e3210789aa19335367da696ea Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 4 Apr 2025 14:40:06 +0200 Subject: cpufreq: Do not enable by default during compile testing Enabling the compile test should not cause automatic enabling of all drivers. 
Signed-off-by: Krzysztof Kozlowski Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index 4f9cb943d945..d4d625ded285 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -76,7 +76,7 @@ config ARM_VEXPRESS_SPC_CPUFREQ config ARM_BRCMSTB_AVS_CPUFREQ tristate "Broadcom STB AVS CPUfreq driver" depends on (ARCH_BRCMSTB && !ARM_SCMI_CPUFREQ) || COMPILE_TEST - default y + default ARCH_BRCMSTB help Some Broadcom STB SoCs use a co-processor running proprietary firmware ("AVS") to handle voltage and frequency scaling. This driver provides @@ -181,7 +181,7 @@ config ARM_RASPBERRYPI_CPUFREQ config ARM_S3C64XX_CPUFREQ bool "Samsung S3C64XX" depends on CPU_S3C6410 || COMPILE_TEST - default y + default CPU_S3C6410 help This adds the CPUFreq driver for Samsung S3C6410 SoC. @@ -190,7 +190,7 @@ config ARM_S3C64XX_CPUFREQ config ARM_S5PV210_CPUFREQ bool "Samsung S5PV210 and S5PC110" depends on CPU_S5PV210 || COMPILE_TEST - default y + default CPU_S5PV210 help This adds the CPUFreq driver for Samsung S5PV210 and S5PC110 SoCs. @@ -214,7 +214,7 @@ config ARM_SCMI_CPUFREQ config ARM_SPEAR_CPUFREQ bool "SPEAr CPUFreq support" depends on PLAT_SPEAR || COMPILE_TEST - default y + default PLAT_SPEAR help This adds the CPUFreq driver support for SPEAr SOCs. @@ -233,7 +233,7 @@ config ARM_TEGRA20_CPUFREQ tristate "Tegra20/30 CPUFreq support" depends on ARCH_TEGRA || COMPILE_TEST depends on CPUFREQ_DT - default y + default ARCH_TEGRA help This adds the CPUFreq driver support for Tegra20/30 SOCs. @@ -241,7 +241,7 @@ config ARM_TEGRA124_CPUFREQ bool "Tegra124 CPUFreq support" depends on ARCH_TEGRA || COMPILE_TEST depends on CPUFREQ_DT - default y + default ARCH_TEGRA help This adds the CPUFreq driver support for Tegra124 SOCs. @@ -256,14 +256,14 @@ config ARM_TEGRA194_CPUFREQ tristate "Tegra194 CPUFreq support" depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC || (64BIT && COMPILE_TEST) depends on TEGRA_BPMP - default y + default ARCH_TEGRA help This adds CPU frequency driver support for Tegra194 SOCs. config ARM_TI_CPUFREQ bool "Texas Instruments CPUFreq support" depends on ARCH_OMAP2PLUS || ARCH_K3 || COMPILE_TEST - default y + default ARCH_OMAP2PLUS || ARCH_K3 help This driver enables valid OPPs on the running platform based on values contained within the SoC in use. Enable this in order to -- cgit v1.2.3 From fe81536af3978f26a1383e4da7f135b973eb4209 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Mon, 31 Mar 2025 12:47:07 +0200 Subject: landlock: Remove incorrect warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit landlock_put_hierarchy() can be called when an error occurs in landlock_merge_ruleset() due to insufficient memory. In this case, the domain's audit details might not have been allocated yet, which would cause landlock_free_hierarchy_details() to print a warning (but still safely handle this case). We could keep the WARN_ON_ONCE(!hierarchy) but it's not worth it for this kind of function, so let's remove it entirely. 
Cc: Paul Moore Reported-by: syzbot+8bca99e91de7e060e4ea@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/20250331104709.897062-1-mic@digikod.net Reviewed-by: Günther Noack Signed-off-by: Mickaël Salaün --- security/landlock/domain.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/landlock/domain.h b/security/landlock/domain.h index ed0d348e214c..7fb70b25f85a 100644 --- a/security/landlock/domain.h +++ b/security/landlock/domain.h @@ -130,7 +130,7 @@ int landlock_init_hierarchy_log(struct landlock_hierarchy *const hierarchy); static inline void landlock_free_hierarchy_details(struct landlock_hierarchy *const hierarchy) { - if (WARN_ON_ONCE(!hierarchy || !hierarchy->details)) + if (!hierarchy || !hierarchy->details) return; put_pid(hierarchy->details->pid); -- cgit v1.2.3 From 05a2b0011c4b6cbbc9b577f6abebe4e9333b0cf6 Mon Sep 17 00:00:00 2001 From: Lukas Fischer Date: Fri, 4 Apr 2025 14:51:51 +0200 Subject: scripts: generate_rust_analyzer: Add ffi crate Commit d072acda4862 ("rust: use custom FFI integer types") did not update rust-analyzer to include the new crate. To enable rust-analyzer support for these custom ffi types, add the `ffi` crate as a dependency to the `bindings`, `uapi` and `kernel` crates, which all directly depend on it. Fixes: d072acda4862 ("rust: use custom FFI integer types") Signed-off-by: Lukas Fischer Reviewed-by: Tamir Duberstein Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20250404125150.85783-2-kernel@o1oo11oo.de Signed-off-by: Miguel Ojeda --- scripts/generate_rust_analyzer.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scripts/generate_rust_analyzer.py b/scripts/generate_rust_analyzer.py index cd41bc906fbd..fe663dd0c43b 100755 --- a/scripts/generate_rust_analyzer.py +++ b/scripts/generate_rust_analyzer.py @@ -112,6 +112,12 @@ def generate_crates(srctree, objtree, sysroot_src, external_src, cfgs): cfg=["kernel"], ) + append_crate( + "ffi", + srctree / "rust" / "ffi.rs", + ["core", "compiler_builtins"], + ) + def append_crate_with_generated( display_name, deps, @@ -131,9 +137,9 @@ def generate_crates(srctree, objtree, sysroot_src, external_src, cfgs): "exclude_dirs": [], } - append_crate_with_generated("bindings", ["core"]) - append_crate_with_generated("uapi", ["core"]) - append_crate_with_generated("kernel", ["core", "macros", "build_error", "pin_init", "bindings", "uapi"]) + append_crate_with_generated("bindings", ["core", "ffi"]) + append_crate_with_generated("uapi", ["core", "ffi"]) + append_crate_with_generated("kernel", ["core", "macros", "build_error", "pin_init", "ffi", "bindings", "uapi"]) def is_root_crate(build_file, target): try: -- cgit v1.2.3 From 47068309b5777313b6ac84a77d8d10dc7312260a Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 8 Apr 2025 09:50:42 -0700 Subject: sched_ext: Use kvzalloc for large exit_dump allocation Replace kzalloc with kvzalloc for the exit_dump buffer allocation, which can require large contiguous memory depending on the implementation. This change prevents allocation failures by allowing the system to fall back to vmalloc when contiguous memory allocation fails. Since this buffer is only used for debugging purposes, physical memory contiguity is not required, making vmalloc a suitable alternative. 
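The allocation/free pairing the patch relies on, sketched out of context (assumes the usual kernel allocator headers; the helper names are illustrative):

```
#include <linux/slab.h>		/* kvzalloc(), kvfree() */

/*
 * Illustrative only: a possibly large debug buffer that need not be
 * physically contiguous. kvzalloc() tries kmalloc() first and falls back
 * to vmalloc(), so the buffer must be released with kvfree(), not kfree().
 */
static char *alloc_dump_buf(size_t len)
{
	return kvzalloc(len, GFP_KERNEL);
}

static void free_dump_buf(char *buf)
{
	kvfree(buf);	/* correct for both kmalloc- and vmalloc-backed memory */
}
```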
Cc: stable@vger.kernel.org Fixes: 07814a9439a3b0 ("sched_ext: Print debug dump after an error exit") Suggested-by: Rik van Riel Signed-off-by: Breno Leitao Acked-by: Andrea Righi Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 66bcd40a28ca..db9af6a3c04f 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -4623,7 +4623,7 @@ unlock: static void free_exit_info(struct scx_exit_info *ei) { - kfree(ei->dump); + kvfree(ei->dump); kfree(ei->msg); kfree(ei->bt); kfree(ei); @@ -4639,7 +4639,7 @@ static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) ei->bt = kcalloc(SCX_EXIT_BT_LEN, sizeof(ei->bt[0]), GFP_KERNEL); ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); - ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); + ei->dump = kvzalloc(exit_dump_len, GFP_KERNEL); if (!ei->bt || !ei->msg || !ei->dump) { free_exit_info(ei); -- cgit v1.2.3 From e776b26e3701945e0d20a11dc30ecd4da98e8d67 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 7 Apr 2025 13:20:04 -1000 Subject: sched_ext: Remove cpu.weight / cpu.idle unimplemented warnings sched_ext generates warnings when cpu.weight / cpu.idle are set to non-default values if the BPF scheduler doesn't implement weight support. These warnings don't provide much value while adding constant annoyance. A BPF scheduler may not implement any particular behavior and there's nothing particularly special about missing cgroup weight support. Drop the warnings. Signed-off-by: Tejun Heo --- kernel/sched/ext.c | 41 +---------------------------------------- 1 file changed, 1 insertion(+), 40 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index db9af6a3c04f..21eaf081d336 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -3899,35 +3899,6 @@ bool scx_can_stop_tick(struct rq *rq) DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); static bool scx_cgroup_enabled; -static bool cgroup_warned_missing_weight; -static bool cgroup_warned_missing_idle; - -static void scx_cgroup_warn_missing_weight(struct task_group *tg) -{ - if (scx_ops_enable_state() == SCX_OPS_DISABLED || - cgroup_warned_missing_weight) - return; - - if ((scx_ops.flags & SCX_OPS_HAS_CGROUP_WEIGHT) || !tg->css.parent) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.weight\n", - scx_ops.name); - cgroup_warned_missing_weight = true; -} - -static void scx_cgroup_warn_missing_idle(struct task_group *tg) -{ - if (!scx_cgroup_enabled || cgroup_warned_missing_idle) - return; - - if (!tg->idle) - return; - - pr_warn("sched_ext: \"%s\" does not implement cgroup cpu.idle\n", - scx_ops.name); - cgroup_warned_missing_idle = true; -} int scx_tg_online(struct task_group *tg) { @@ -3937,8 +3908,6 @@ int scx_tg_online(struct task_group *tg) percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_weight(tg); - if (scx_cgroup_enabled) { if (SCX_HAS_OP(cgroup_init)) { struct scx_cgroup_init_args args = @@ -4076,9 +4045,7 @@ void scx_group_set_weight(struct task_group *tg, unsigned long weight) void scx_group_set_idle(struct task_group *tg, bool idle) { - percpu_down_read(&scx_cgroup_rwsem); - scx_cgroup_warn_missing_idle(tg); - percpu_up_read(&scx_cgroup_rwsem); + /* TODO: Implement ops->cgroup_set_idle() */ } static void scx_cgroup_lock(void) @@ -4272,9 +4239,6 @@ static int scx_cgroup_init(void) percpu_rwsem_assert_held(&scx_cgroup_rwsem); - cgroup_warned_missing_weight = false; - cgroup_warned_missing_idle = false; - /* * scx_tg_on/offline() are 
excluded through scx_cgroup_rwsem. If we walk * cgroups and init, all online cgroups are initialized. @@ -4284,9 +4248,6 @@ static int scx_cgroup_init(void) struct task_group *tg = css_tg(css); struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; - scx_cgroup_warn_missing_weight(tg); - scx_cgroup_warn_missing_idle(tg); - if ((tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) continue; -- cgit v1.2.3 From bc08b15b54b8aadbc8a8f413271c07a3f4bead87 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 8 Apr 2025 08:31:12 -1000 Subject: sched_ext: Mark SCX_OPS_HAS_CGROUP_WEIGHT for deprecation SCX_OPS_HAS_CGROUP_WEIGHT was only used to suppress the missing cgroup weight support warnings. Now that the warnings are removed, the flag doesn't do anything. Mark it for deprecation and remove its usage from scx_flatcg. v2: Actually include the scx_flatcg update. Signed-off-by: Tejun Heo Suggested-and-reviewed-by: Andrea Righi --- kernel/sched/ext.c | 5 ++++- tools/sched_ext/scx_flatcg.bpf.c | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c index 21eaf081d336..fdbf249d1c68 100644 --- a/kernel/sched/ext.c +++ b/kernel/sched/ext.c @@ -163,7 +163,7 @@ enum scx_ops_flags { /* * CPU cgroup support flags */ - SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* cpu.weight */ + SCX_OPS_HAS_CGROUP_WEIGHT = 1LLU << 16, /* DEPRECATED, will be removed on 6.18 */ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | SCX_OPS_ENQ_LAST | @@ -5213,6 +5213,9 @@ static int validate_ops(const struct sched_ext_ops *ops) return -EINVAL; } + if (ops->flags & SCX_OPS_HAS_CGROUP_WEIGHT) + pr_warn("SCX_OPS_HAS_CGROUP_WEIGHT is deprecated and a noop\n"); + return 0; } diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c index 2c720e3ecad5..fdc7170639e6 100644 --- a/tools/sched_ext/scx_flatcg.bpf.c +++ b/tools/sched_ext/scx_flatcg.bpf.c @@ -950,5 +950,5 @@ SCX_OPS_DEFINE(flatcg_ops, .cgroup_move = (void *)fcg_cgroup_move, .init = (void *)fcg_init, .exit = (void *)fcg_exit, - .flags = SCX_OPS_HAS_CGROUP_WEIGHT | SCX_OPS_ENQ_EXITING, + .flags = SCX_OPS_ENQ_EXITING, .name = "flatcg"); -- cgit v1.2.3 From 3c75fff196c35c3bbe8c212ad03db94994130b32 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 7 Apr 2025 20:18:21 +0000 Subject: rust: pin-init: alloc: restrict `impl ZeroableOption` for `Box` to `T: Sized` Similar to what was done for `Zeroable>` in commit df27cef15360 ("rust: init: fix `Zeroable` implementation for `Option>` and `Option>`"), the latest Rust documentation [1] says it guarantees that `transmute::<_, Option>([0u8; size_of::()])` is sound and produces `Option::::None` only in some cases. In particular, it says: `Box` (specifically, only `Box`) when `U: Sized` Thus restrict the `impl` to `Sized`, and use similar wording as in that commit too. Link: https://doc.rust-lang.org/stable/std/option/index.html#representation [1] Signed-off-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/pin-init/pull/32/commits/a6007cf555e5946bcbfafe93a6468c329078acd8 Fixes: 9b2299af3b92 ("rust: pin-init: add `std` and `alloc` support from the user-space version") Cc: stable@vger.kernel.org [ Adjust mentioned commit to the one from the kernel. 
- Benno ] Signed-off-by: Benno Lossin Link: https://lore.kernel.org/r/20250407201755.649153-2-benno.lossin@proton.me Signed-off-by: Miguel Ojeda --- rust/pin-init/src/alloc.rs | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/rust/pin-init/src/alloc.rs b/rust/pin-init/src/alloc.rs index e16baa3b434e..5017f57442d8 100644 --- a/rust/pin-init/src/alloc.rs +++ b/rust/pin-init/src/alloc.rs @@ -17,11 +17,9 @@ use crate::{ pub extern crate alloc; -// SAFETY: All zeros is equivalent to `None` (option layout optimization guarantee). -// -// In this case we are allowed to use `T: ?Sized`, since all zeros is the `None` variant and there -// is no problem with a VTABLE pointer being null. -unsafe impl ZeroableOption for Box {} +// SAFETY: All zeros is equivalent to `None` (option layout optimization guarantee: +// ). +unsafe impl ZeroableOption for Box {} /// Smart pointer that can initialize memory in-place. pub trait InPlaceInit: Sized { -- cgit v1.2.3 From 193b5a75744af2669ad7c5f7aa4f9219dc73ca69 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 7 Apr 2025 20:18:29 +0000 Subject: rust: pin-init: use Markdown autolinks in Rust comments "Normal" comments in Rust (`//`) are also formatted in Markdown, like the documentation (`///` and `//!`), see Documentation/rust/coding-guidelines.rst Thus use Markdown autolinks for a couple links that were missing it. It also helps to get proper linking in some software like kitty [1]. Suggested-by: Benno Lossin Link: https://github.com/Rust-for-Linux/pin-init/pull/32#discussion_r2023103712 [1] Signed-off-by: Miguel Ojeda Link: https://github.com/Rust-for-Linux/pin-init/pull/32/commits/dd230d61bf0538281072fbff4bb71efc58f3420c Fixes: 84837cf6fa54 ("rust: pin-init: change examples to the user-space version") Cc: stable@vger.kernel.org [ Change case in title. Reworded commit message. - Benno ] Signed-off-by: Benno Lossin Link: https://lore.kernel.org/r/20250407201755.649153-3-benno.lossin@proton.me Signed-off-by: Miguel Ojeda --- rust/pin-init/examples/pthread_mutex.rs | 2 +- rust/pin-init/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rust/pin-init/examples/pthread_mutex.rs b/rust/pin-init/examples/pthread_mutex.rs index 9164298c44c0..5ac22f1880d2 100644 --- a/rust/pin-init/examples/pthread_mutex.rs +++ b/rust/pin-init/examples/pthread_mutex.rs @@ -1,6 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 OR MIT -// inspired by https://github.com/nbdd0121/pin-init/blob/trunk/examples/pthread_mutex.rs +// inspired by #![allow(clippy::undocumented_unsafe_blocks)] #![cfg_attr(feature = "alloc", feature(allocator_api))] #[cfg(not(windows))] diff --git a/rust/pin-init/src/lib.rs b/rust/pin-init/src/lib.rs index 05c44514765e..0806c689f693 100644 --- a/rust/pin-init/src/lib.rs +++ b/rust/pin-init/src/lib.rs @@ -1447,7 +1447,7 @@ impl_zeroable! { {} UnsafeCell, // SAFETY: All zeros is equivalent to `None` (option layout optimization guarantee: - // https://doc.rust-lang.org/stable/std/option/index.html#representation). + // ). Option, Option, Option, Option, Option, Option, Option, Option, Option, Option, -- cgit v1.2.3 From c59026c0570a2a97ce2e7d5ae5e9c48fc841542b Mon Sep 17 00:00:00 2001 From: Sami Tolvanen Date: Tue, 18 Mar 2025 23:18:16 +0000 Subject: rust: kbuild: Don't export __pfx symbols With CONFIG_PREFIX_SYMBOLS, objtool adds __pfx prefix symbols to claim the compiler emitted call padding bytes. 
When CONFIG_X86_KERNEL_IBT is not selected, the symbols are added to individual object files and for Rust objects, they end up being exported, resulting in warnings with CONFIG_GENDWARFKSYMS as the symbols have no debugging information: warning: gendwarfksyms: symbol_print_versions: no information for symbol __pfx_rust_helper_put_task_struct warning: gendwarfksyms: symbol_print_versions: no information for symbol __pfx_rust_helper_task_euid warning: gendwarfksyms: symbol_print_versions: no information for symbol __pfx_rust_helper_readq_relaxed ... Filter out the __pfx prefix from exported symbols similarly to the existing __cfi and __odr_asan prefixes. Signed-off-by: Sami Tolvanen Reviewed-by: Alice Ryhl Cc: stable@vger.kernel.org Fixes: ac61506bf2d1 ("rust: Use gendwarfksyms + extended modversions for CONFIG_MODVERSIONS") Link: https://lore.kernel.org/r/20250318231815.917621-2-samitolvanen@google.com Signed-off-by: Miguel Ojeda --- rust/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rust/Makefile b/rust/Makefile index fa0eea8a9eca..3aca903a7d08 100644 --- a/rust/Makefile +++ b/rust/Makefile @@ -368,7 +368,7 @@ $(obj)/bindings/bindings_helpers_generated.rs: private bindgen_target_extra = ; $(obj)/bindings/bindings_helpers_generated.rs: $(src)/helpers/helpers.c FORCE $(call if_changed_dep,bindgen) -rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__cfi/ && $$3!~/__odr_asan/ { printf $(2),$$3 }' +rust_exports = $(NM) -p --defined-only $(1) | awk '$$2~/(T|R|D|B)/ && $$3!~/__(pfx|cfi|odr_asan)/ { printf $(2),$$3 }' quiet_cmd_exports = EXPORTS $@ cmd_exports = \ -- cgit v1.2.3 From d7b98ae5221007d3f202746903d4c21c7caf7ea9 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 9 Apr 2025 17:15:42 +0200 Subject: dma/contiguous: avoid warning about unused size_bytes When building with W=1, this variable is unused for configs with CONFIG_CMA_SIZE_SEL_PERCENTAGE=y: kernel/dma/contiguous.c:67:26: error: 'size_bytes' defined but not used [-Werror=unused-const-variable=] Change this to a macro to avoid the warning. Fixes: c64be2bb1c6e ("drivers: add Contiguous Memory Allocator") Signed-off-by: Arnd Bergmann Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250409151557.3890443-1-arnd@kernel.org --- kernel/dma/contiguous.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/dma/contiguous.c b/kernel/dma/contiguous.c index 055da410ac71..8df0dfaaca18 100644 --- a/kernel/dma/contiguous.c +++ b/kernel/dma/contiguous.c @@ -64,8 +64,7 @@ struct cma *dma_contiguous_default_area; * Users, who want to set the size of global CMA area for their system * should use cma= kernel parameter. */ -static const phys_addr_t size_bytes __initconst = - (phys_addr_t)CMA_SIZE_MBYTES * SZ_1M; +#define size_bytes ((phys_addr_t)CMA_SIZE_MBYTES * SZ_1M) static phys_addr_t size_cmdline __initdata = -1; static phys_addr_t base_cmdline __initdata; static phys_addr_t limit_cmdline __initdata; -- cgit v1.2.3 From 87d2de042c602e12230283cd40fa604b881e12f7 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sun, 23 Mar 2025 17:31:08 +0800 Subject: cxl/core: Fix caching dport GPF DVSEC issue Per Table 8-2 in CXL r3.2 section 8.1.1 and CXL r3.2 section 8.1.6, only CXL Downstream switch ports and CXL root ports have GPF DVSEC for CXL Port(DVSEC ID 04h). CXL subsystem has a gpf_dvsec in struct cxl_port which is used to cache the offset of a GPF DVSEC in PCIe configuration space. 
It will be updated during the first EP attaching to the cxl_port, so the gpf_dvsec can only cache the GPF DVSEC offset of the dport which the first EP is under. Will not have chance to update it during other EPs attaching. That means CXL subsystem will use the same GPF DVSEC offset for all dports under the port, it will be a problem if the GPF DVSEC offset cached in cxl_port is not the right offset for a dport. Moving gpf_dvsec from struct cxl_port to struct cxl_dport, make every cxl dport has their own GPF DVSEC offset caching, and each cxl dport uses its own GPF DVSEC offset for GPF DVSEC accessing. Fixes: a52b6a2c1c99 ("cxl/pci: Support Global Persistent Flush (GPF)") Signed-off-by: Li Ming Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Tested-by: Davidlohr Bueso Link: https://patch.msgid.link/20250323093110.233040-2-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/core.h | 2 +- drivers/cxl/core/pci.c | 16 ++++++++-------- drivers/cxl/core/port.c | 2 +- drivers/cxl/cxl.h | 4 ++-- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/drivers/cxl/core/core.h b/drivers/cxl/core/core.h index 15699299dc11..17b692eb3257 100644 --- a/drivers/cxl/core/core.h +++ b/drivers/cxl/core/core.h @@ -119,7 +119,7 @@ int cxl_port_get_switch_dport_bandwidth(struct cxl_port *port, int cxl_ras_init(void); void cxl_ras_exit(void); -int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port); +int cxl_gpf_port_setup(struct cxl_dport *dport); int cxl_acpi_get_extended_linear_cache_size(struct resource *backing_res, int nid, resource_size_t *size); diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 96fecb799cbc..aab0a505d527 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1128,26 +1128,26 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) return rc; } -int cxl_gpf_port_setup(struct device *dport_dev, struct cxl_port *port) +int cxl_gpf_port_setup(struct cxl_dport *dport) { struct pci_dev *pdev; - if (!port) + if (!dport) return -EINVAL; - if (!port->gpf_dvsec) { + if (!dport->gpf_dvsec) { int dvsec; - dvsec = cxl_gpf_get_dvsec(dport_dev, true); + dvsec = cxl_gpf_get_dvsec(dport->dport_dev, true); if (!dvsec) return -EINVAL; - port->gpf_dvsec = dvsec; + dport->gpf_dvsec = dvsec; } - pdev = to_pci_dev(dport_dev); - update_gpf_port_dvsec(pdev, port->gpf_dvsec, 1); - update_gpf_port_dvsec(pdev, port->gpf_dvsec, 2); + pdev = to_pci_dev(dport->dport_dev); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 1); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 2); return 0; } diff --git a/drivers/cxl/core/port.c b/drivers/cxl/core/port.c index 0fd6646c1a2e..726bd4a7de27 100644 --- a/drivers/cxl/core/port.c +++ b/drivers/cxl/core/port.c @@ -1678,7 +1678,7 @@ retry: if (rc && rc != -EBUSY) return rc; - cxl_gpf_port_setup(dport_dev, port); + cxl_gpf_port_setup(dport); /* Any more ports to add between this one and the root? 
*/ if (!dev_is_cxl_root_child(&port->dev)) diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index be8a7dc77719..2d81ccd83916 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -592,7 +592,6 @@ struct cxl_dax_region { * @cdat: Cached CDAT data * @cdat_available: Should a CDAT attribute be available in sysfs * @pci_latency: Upstream latency in picoseconds - * @gpf_dvsec: Cached GPF port DVSEC */ struct cxl_port { struct device dev; @@ -616,7 +615,6 @@ struct cxl_port { } cdat; bool cdat_available; long pci_latency; - int gpf_dvsec; }; /** @@ -664,6 +662,7 @@ struct cxl_rcrb_info { * @regs: Dport parsed register blocks * @coord: access coordinates (bandwidth and latency performance attributes) * @link_latency: calculated PCIe downstream latency + * @gpf_dvsec: Cached GPF port DVSEC */ struct cxl_dport { struct device *dport_dev; @@ -675,6 +674,7 @@ struct cxl_dport { struct cxl_regs regs; struct access_coordinate coord[ACCESS_COORDINATE_MAX]; long link_latency; + int gpf_dvsec; }; /** -- cgit v1.2.3 From 6af941db6a60a27209bdb2da1a3a780574d617fe Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sun, 23 Mar 2025 17:31:09 +0800 Subject: cxl/pci: Update Port GPF timeout only when the first EP attaching update_gpf_port_dvsec() is used to update GPF Phase timeout, if a CXL switch is under a CXL root port, update_gpf_port_dvsec() will be invoked on the CXL root port when each cxl memory device under the CXL switch is attaching. It is enough to be invoked once, others are redundant. When the first EP attaching, it always triggers its ancestor dports to locate their own Port GPF DVSEC. The change is that invoking update_gpf_port_dvsec() on these ancestor dports after ancestor dport locating a Port GPF DVSEC. It guarantees that update_gpf_port_dvsec() is invoked on a dport only happens during the first EP attaching. Signed-off-by: Li Ming Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Tested-by: Davidlohr Bueso Link: https://patch.msgid.link/20250323093110.233040-3-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index aab0a505d527..edbdaf1681e8 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1130,12 +1130,11 @@ static int update_gpf_port_dvsec(struct pci_dev *pdev, int dvsec, int phase) int cxl_gpf_port_setup(struct cxl_dport *dport) { - struct pci_dev *pdev; - if (!dport) return -EINVAL; if (!dport->gpf_dvsec) { + struct pci_dev *pdev; int dvsec; dvsec = cxl_gpf_get_dvsec(dport->dport_dev, true); @@ -1143,11 +1142,10 @@ int cxl_gpf_port_setup(struct cxl_dport *dport) return -EINVAL; dport->gpf_dvsec = dvsec; + pdev = to_pci_dev(dport->dport_dev); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 1); + update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 2); } - pdev = to_pci_dev(dport->dport_dev); - update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 1); - update_gpf_port_dvsec(pdev, dport->gpf_dvsec, 2); - return 0; } -- cgit v1.2.3 From 36aace15d9bdcfe6f03e078915067e89719478f5 Mon Sep 17 00:00:00 2001 From: Li Ming Date: Sun, 23 Mar 2025 17:31:10 +0800 Subject: cxl/pci: Drop the parameter is_port of cxl_gpf_get_dvsec() The first parameter of cxl_gpf_get_dvsec() is a struct device, can be used to distinguish if the device is a cxl dport or a cxl pci device by checking the PCIe type of it, so the parameter is_port is unnecessary to cxl_gpf_get_dvsec(), using parameter struct device is enough. 
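The decision the reworked helper makes internally can be sketched as follows (illustrative only; the function name is hypothetical):

```
#include <linux/pci.h>

/* Hypothetical sketch: decide "port vs. endpoint" from the PCIe device type. */
static bool wants_port_gpf_dvsec(struct device *dev)
{
	struct pci_dev *pdev;

	if (!dev_is_pci(dev))
		return false;

	pdev = to_pci_dev(dev);
	/* Endpoints carry the device GPF DVSEC; anything else is a port. */
	return pci_pcie_type(pdev) != PCI_EXP_TYPE_ENDPOINT;
}
```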
Signed-off-by: Li Ming Reviewed-by: Davidlohr Bueso Reviewed-by: Jonathan Cameron Reviewed-by: Dan Williams Tested-by: Davidlohr Bueso Link: https://patch.msgid.link/20250323093110.233040-4-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/pci.c | 12 +++++++++--- drivers/cxl/cxl.h | 2 +- drivers/cxl/pmem.c | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index edbdaf1681e8..3b80e9a76ba8 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -1072,14 +1072,20 @@ int cxl_pci_get_bandwidth(struct pci_dev *pdev, struct access_coordinate *c) #define GPF_TIMEOUT_BASE_MAX 2 #define GPF_TIMEOUT_SCALE_MAX 7 /* 10 seconds */ -u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port) +u16 cxl_gpf_get_dvsec(struct device *dev) { + struct pci_dev *pdev; + bool is_port = true; u16 dvsec; if (!dev_is_pci(dev)) return 0; - dvsec = pci_find_dvsec_capability(to_pci_dev(dev), PCI_VENDOR_ID_CXL, + pdev = to_pci_dev(dev); + if (pci_pcie_type(pdev) == PCI_EXP_TYPE_ENDPOINT) + is_port = false; + + dvsec = pci_find_dvsec_capability(pdev, PCI_VENDOR_ID_CXL, is_port ? CXL_DVSEC_PORT_GPF : CXL_DVSEC_DEVICE_GPF); if (!dvsec) dev_warn(dev, "%s GPF DVSEC not present\n", @@ -1137,7 +1143,7 @@ int cxl_gpf_port_setup(struct cxl_dport *dport) struct pci_dev *pdev; int dvsec; - dvsec = cxl_gpf_get_dvsec(dport->dport_dev, true); + dvsec = cxl_gpf_get_dvsec(dport->dport_dev); if (!dvsec) return -EINVAL; diff --git a/drivers/cxl/cxl.h b/drivers/cxl/cxl.h index 2d81ccd83916..a9ab46eb0610 100644 --- a/drivers/cxl/cxl.h +++ b/drivers/cxl/cxl.h @@ -910,6 +910,6 @@ bool cxl_endpoint_decoder_reset_detected(struct cxl_port *port); #define __mock static #endif -u16 cxl_gpf_get_dvsec(struct device *dev, bool is_port); +u16 cxl_gpf_get_dvsec(struct device *dev); #endif /* __CXL_H__ */ diff --git a/drivers/cxl/pmem.c b/drivers/cxl/pmem.c index d061fe3d2b86..e197883690ef 100644 --- a/drivers/cxl/pmem.c +++ b/drivers/cxl/pmem.c @@ -108,7 +108,7 @@ static void cxl_nvdimm_arm_dirty_shutdown_tracking(struct cxl_nvdimm *cxl_nvd) return; } - if (!cxl_gpf_get_dvsec(cxlds->dev, false)) + if (!cxl_gpf_get_dvsec(cxlds->dev)) return; if (cxl_get_dirty_count(mds, &count)) { -- cgit v1.2.3 From 9992649f6786921873a9b89dafa5e04d8c5fef2b Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Wed, 9 Apr 2025 20:48:13 +0800 Subject: cpufreq: apple-soc: Fix null-ptr-deref in apple_soc_cpufreq_get_rate() cpufreq_cpu_get_raw() can return NULL when the target CPU is not present in the policy->cpus mask. apple_soc_cpufreq_get_rate() does not check for this case, which results in a NULL pointer dereference. 
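The general shape of the fix, shared with the scmi and scpi ->get() fixes that follow, is roughly this (a sketch; the callback name is hypothetical):

```
#include <linux/cpufreq.h>

static unsigned int example_cpufreq_get_rate(unsigned int cpu)
{
	struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu);

	if (unlikely(!policy))
		return 0;	/* CPU not managed by this driver: report no rate */

	/* Only now is policy->driver_data safe to dereference. */
	return 0;	/* a real driver would compute the rate from driver_data */
}
```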
Fixes: 6286bbb40576 ("cpufreq: apple-soc: Add new driver to control Apple SoC CPU P-states") Signed-off-by: Henry Martin Signed-off-by: Viresh Kumar --- drivers/cpufreq/apple-soc-cpufreq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/apple-soc-cpufreq.c b/drivers/cpufreq/apple-soc-cpufreq.c index 4994c86feb57..b1d29b7af232 100644 --- a/drivers/cpufreq/apple-soc-cpufreq.c +++ b/drivers/cpufreq/apple-soc-cpufreq.c @@ -134,11 +134,17 @@ static const struct of_device_id apple_soc_cpufreq_of_match[] __maybe_unused = { static unsigned int apple_soc_cpufreq_get_rate(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu); - struct apple_cpu_priv *priv = policy->driver_data; + struct cpufreq_policy *policy; + struct apple_cpu_priv *priv; struct cpufreq_frequency_table *p; unsigned int pstate; + policy = cpufreq_cpu_get_raw(cpu); + if (unlikely(!policy)) + return 0; + + priv = policy->driver_data; + if (priv->info->cur_pstate_mask) { u32 reg = readl_relaxed(priv->reg_base + APPLE_DVFS_STATUS); -- cgit v1.2.3 From 484d3f15cc6cbaa52541d6259778e715b2c83c54 Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Tue, 8 Apr 2025 23:03:53 +0800 Subject: cpufreq: scmi: Fix null-ptr-deref in scmi_cpufreq_get_rate() cpufreq_cpu_get_raw() can return NULL when the target CPU is not present in the policy->cpus mask. scmi_cpufreq_get_rate() does not check for this case, which results in a NULL pointer dereference. Add NULL check after cpufreq_cpu_get_raw() to prevent this issue. Fixes: 99d6bdf33877 ("cpufreq: add support for CPU DVFS based on SCMI message protocol") Signed-off-by: Henry Martin Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- drivers/cpufreq/scmi-cpufreq.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/scmi-cpufreq.c b/drivers/cpufreq/scmi-cpufreq.c index c310aeebc8f3..944e899eb1be 100644 --- a/drivers/cpufreq/scmi-cpufreq.c +++ b/drivers/cpufreq/scmi-cpufreq.c @@ -37,11 +37,17 @@ static struct cpufreq_driver scmi_cpufreq_driver; static unsigned int scmi_cpufreq_get_rate(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu); - struct scmi_data *priv = policy->driver_data; + struct cpufreq_policy *policy; + struct scmi_data *priv; unsigned long rate; int ret; + policy = cpufreq_cpu_get_raw(cpu); + if (unlikely(!policy)) + return 0; + + priv = policy->driver_data; + ret = perf_ops->freq_get(ph, priv->domain_id, &rate, false); if (ret) return 0; -- cgit v1.2.3 From 73b24dc731731edf762f9454552cb3a5b7224949 Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Tue, 8 Apr 2025 23:03:54 +0800 Subject: cpufreq: scpi: Fix null-ptr-deref in scpi_cpufreq_get_rate() cpufreq_cpu_get_raw() can return NULL when the target CPU is not present in the policy->cpus mask. scpi_cpufreq_get_rate() does not check for this case, which results in a NULL pointer dereference. 
Fixes: 343a8d17fa8d ("cpufreq: scpi: remove arm_big_little dependency") Signed-off-by: Henry Martin Acked-by: Sudeep Holla Signed-off-by: Viresh Kumar --- drivers/cpufreq/scpi-cpufreq.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/cpufreq/scpi-cpufreq.c b/drivers/cpufreq/scpi-cpufreq.c index 17cda84f00df..dcbb0ae7dd47 100644 --- a/drivers/cpufreq/scpi-cpufreq.c +++ b/drivers/cpufreq/scpi-cpufreq.c @@ -29,9 +29,16 @@ static struct scpi_ops *scpi_ops; static unsigned int scpi_cpufreq_get_rate(unsigned int cpu) { - struct cpufreq_policy *policy = cpufreq_cpu_get_raw(cpu); - struct scpi_data *priv = policy->driver_data; - unsigned long rate = clk_get_rate(priv->clk); + struct cpufreq_policy *policy; + struct scpi_data *priv; + unsigned long rate; + + policy = cpufreq_cpu_get_raw(cpu); + if (unlikely(!policy)) + return 0; + + priv = policy->driver_data; + rate = clk_get_rate(priv->clk); return rate / 1000; } -- cgit v1.2.3 From dcdae6e92d4e062da29235fe88980604595e3f0f Mon Sep 17 00:00:00 2001 From: Maíra Canal Date: Wed, 9 Apr 2025 17:50:06 -0300 Subject: drm/v3d: Fix Indirect Dispatch configuration for V3D 7.1.6 and later MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This commit is a resubmission of commit 1fe1c66274fb ("drm/v3d: Fix Indirect Dispatch configuration for V3D 7.1.6 and later"), which was accidentally reverted by commit 91dae758bdb8 ("Merge tag 'drm-misc-next-2024-08-01' of https://gitlab.freedesktop.org/drm/misc/kernel into drm-next"), likely due to an unfortunate conflict resolution. From the original commit message: ``` `args->cfg[4]` is configured in Indirect Dispatch using the number of batches. Currently, for all V3D tech versions, `args->cfg[4]` equals the number of batches subtracted by 1. But, for V3D 7.1.6 and later, we must not subtract 1 from the number of batches. Implement the fix by checking the V3D tech version and revision. Fixes several `dEQP-VK.synchronization*` CTS tests related to Indirect Dispatch. 
``` Fixes: 91dae758bdb8 ("Merge tag 'drm-misc-next-2024-08-01' of https://gitlab.freedesktop.org/drm/misc/kernel into drm-next") Signed-off-by: Maíra Canal Reviewed-by: Iago Toral Quiroga Link: https://lore.kernel.org/r/20250409205051.9639-1-mcanal@igalia.com --- drivers/gpu/drm/v3d/v3d_sched.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 34c42d6e12cd..4a7701a33cf8 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -428,7 +428,8 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job) struct v3d_bo *bo = to_v3d_bo(job->base.bo[0]); struct v3d_bo *indirect = to_v3d_bo(indirect_csd->indirect); struct drm_v3d_submit_csd *args = &indirect_csd->job->args; - u32 *wg_counts; + struct v3d_dev *v3d = job->base.v3d; + u32 num_batches, *wg_counts; v3d_get_bo_vaddr(bo); v3d_get_bo_vaddr(indirect); @@ -441,8 +442,17 @@ v3d_rewrite_csd_job_wg_counts_from_indirect(struct v3d_cpu_job *job) args->cfg[0] = wg_counts[0] << V3D_CSD_CFG012_WG_COUNT_SHIFT; args->cfg[1] = wg_counts[1] << V3D_CSD_CFG012_WG_COUNT_SHIFT; args->cfg[2] = wg_counts[2] << V3D_CSD_CFG012_WG_COUNT_SHIFT; - args->cfg[4] = DIV_ROUND_UP(indirect_csd->wg_size, 16) * - (wg_counts[0] * wg_counts[1] * wg_counts[2]) - 1; + + num_batches = DIV_ROUND_UP(indirect_csd->wg_size, 16) * + (wg_counts[0] * wg_counts[1] * wg_counts[2]); + + /* V3D 7.1.6 and later don't subtract 1 from the number of batches */ + if (v3d->ver < 71 || (v3d->ver == 71 && v3d->rev < 6)) + args->cfg[4] = num_batches - 1; + else + args->cfg[4] = num_batches; + + WARN_ON(args->cfg[4] == ~0); for (int i = 0; i < 3; i++) { /* 0xffffffff indicates that the uniform rewrite is not needed */ -- cgit v1.2.3 From bcaa391e177c06a58c5f3cd18484a317d40239aa Mon Sep 17 00:00:00 2001 From: Jun Nie Date: Mon, 3 Mar 2025 23:14:30 +0800 Subject: drm/msm/dpu: check every pipe per capability The capability stored in sblk and pipe_hw_caps is checked only for SSPP of the first pipe in the pair with current implementation. That of the 2nd pipe, r_pipe, is not checked and may violate hardware capability. Move requirement check to dpu_plane_atomic_check_pipe() for the check of every pipe. 
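Condensed from the diff below, the resulting call pattern runs the same check on both halves of the pipe pair:

```
/* The per-pipe check now runs for both SSPPs of a wide plane, not only the first. */
ret = dpu_plane_atomic_check_pipe(pdpu, pipe, pipe_cfg,
				  &crtc_state->adjusted_mode, new_plane_state);
if (ret)
	return ret;

if (drm_rect_width(&r_pipe_cfg->src_rect) != 0) {
	ret = dpu_plane_atomic_check_pipe(pdpu, r_pipe, r_pipe_cfg,
					  &crtc_state->adjusted_mode, new_plane_state);
	if (ret)
		return ret;
}
```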
Fixes: ("dbbf57dfd04e6 drm/msm/dpu: split dpu_plane_atomic_check()") Signed-off-by: Jun Nie Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/640513/ Link: https://lore.kernel.org/r/20250303-sm8650-v6-14-hmd-deckard-mdss-quad-upstream-oldbootwrapper-36-prep-v8-1-eb5df105c807@linaro.org Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 71 ++++++++++++++++--------------- 1 file changed, 36 insertions(+), 35 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index af3e541f60c3..aeb90c287245 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -729,12 +729,40 @@ static int dpu_plane_check_inline_rotation(struct dpu_plane *pdpu, static int dpu_plane_atomic_check_pipe(struct dpu_plane *pdpu, struct dpu_sw_pipe *pipe, struct dpu_sw_pipe_cfg *pipe_cfg, - const struct msm_format *fmt, - const struct drm_display_mode *mode) + const struct drm_display_mode *mode, + struct drm_plane_state *new_plane_state) { uint32_t min_src_size; struct dpu_kms *kms = _dpu_plane_get_kms(&pdpu->base); int ret; + const struct msm_format *fmt; + uint32_t supported_rotations; + const struct dpu_sspp_cfg *pipe_hw_caps; + const struct dpu_sspp_sub_blks *sblk; + + pipe_hw_caps = pipe->sspp->cap; + sblk = pipe->sspp->cap->sblk; + + /* + * We already have verified scaling against platform limitations. + * Now check if the SSPP supports scaling at all. + */ + if (!sblk->scaler_blk.len && + ((drm_rect_width(&new_plane_state->src) >> 16 != + drm_rect_width(&new_plane_state->dst)) || + (drm_rect_height(&new_plane_state->src) >> 16 != + drm_rect_height(&new_plane_state->dst)))) + return -ERANGE; + + fmt = msm_framebuffer_format(new_plane_state->fb); + + supported_rotations = DRM_MODE_REFLECT_MASK | DRM_MODE_ROTATE_0; + + if (pipe_hw_caps->features & BIT(DPU_SSPP_INLINE_ROTATION)) + supported_rotations |= DRM_MODE_ROTATE_90; + + pipe_cfg->rotation = drm_rotation_simplify(new_plane_state->rotation, + supported_rotations); min_src_size = MSM_FORMAT_IS_YUV(fmt) ? 2 : 1; @@ -923,47 +951,20 @@ static int dpu_plane_atomic_check_sspp(struct drm_plane *plane, struct dpu_plane_state *pstate = to_dpu_plane_state(new_plane_state); struct dpu_sw_pipe *pipe = &pstate->pipe; struct dpu_sw_pipe *r_pipe = &pstate->r_pipe; - const struct msm_format *fmt; struct dpu_sw_pipe_cfg *pipe_cfg = &pstate->pipe_cfg; struct dpu_sw_pipe_cfg *r_pipe_cfg = &pstate->r_pipe_cfg; - uint32_t supported_rotations; - const struct dpu_sspp_cfg *pipe_hw_caps; - const struct dpu_sspp_sub_blks *sblk; int ret = 0; - pipe_hw_caps = pipe->sspp->cap; - sblk = pipe->sspp->cap->sblk; - - /* - * We already have verified scaling against platform limitations. - * Now check if the SSPP supports scaling at all. 
- */ - if (!sblk->scaler_blk.len && - ((drm_rect_width(&new_plane_state->src) >> 16 != - drm_rect_width(&new_plane_state->dst)) || - (drm_rect_height(&new_plane_state->src) >> 16 != - drm_rect_height(&new_plane_state->dst)))) - return -ERANGE; - - fmt = msm_framebuffer_format(new_plane_state->fb); - - supported_rotations = DRM_MODE_REFLECT_MASK | DRM_MODE_ROTATE_0; - - if (pipe_hw_caps->features & BIT(DPU_SSPP_INLINE_ROTATION)) - supported_rotations |= DRM_MODE_ROTATE_90; - - pipe_cfg->rotation = drm_rotation_simplify(new_plane_state->rotation, - supported_rotations); - r_pipe_cfg->rotation = pipe_cfg->rotation; - - ret = dpu_plane_atomic_check_pipe(pdpu, pipe, pipe_cfg, fmt, - &crtc_state->adjusted_mode); + ret = dpu_plane_atomic_check_pipe(pdpu, pipe, pipe_cfg, + &crtc_state->adjusted_mode, + new_plane_state); if (ret) return ret; if (drm_rect_width(&r_pipe_cfg->src_rect) != 0) { - ret = dpu_plane_atomic_check_pipe(pdpu, r_pipe, r_pipe_cfg, fmt, - &crtc_state->adjusted_mode); + ret = dpu_plane_atomic_check_pipe(pdpu, r_pipe, r_pipe_cfg, + &crtc_state->adjusted_mode, + new_plane_state); if (ret) return ret; } -- cgit v1.2.3 From 5cb1b130e1cd04239cc9c26a98279f4660dce583 Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Thu, 13 Mar 2025 20:10:04 -0500 Subject: drm/msm/dpu: Fix error pointers in dpu_plane_virtual_atomic_check The function dpu_plane_virtual_atomic_check was dereferencing pointers returned by drm_atomic_get_plane_state without checking for errors. This could lead to undefined behavior if the function returns an error pointer. This commit adds checks using IS_ERR to ensure that plane_state is valid before dereferencing them. Similar to commit da29abe71e16 ("drm/amd/display: Fix error pointers in amdgpu_dm_crtc_mem_type_changed"). Fixes: 774bcfb73176 ("drm/msm/dpu: add support for virtual planes") Signed-off-by: Chenyuan Yang Reviewed-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/643132/ Link: https://lore.kernel.org/r/20250314011004.663804-1-chenyuan0y@gmail.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c index aeb90c287245..e03d6091f736 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_plane.c @@ -1060,6 +1060,9 @@ static int dpu_plane_virtual_atomic_check(struct drm_plane *plane, struct drm_crtc_state *crtc_state; int ret; + if (IS_ERR(plane_state)) + return PTR_ERR(plane_state); + if (plane_state->crtc) crtc_state = drm_atomic_get_new_crtc_state(state, plane_state->crtc); -- cgit v1.2.3 From 2a34496fef841e8d89a4ccd1a48c7fd664b5c84f Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Tue, 8 Apr 2025 18:22:23 +0100 Subject: drm/msm/dpu: reorder pointer operations after sanity checks to avoid NULL deref _dpu_encoder_trigger_start dereferences "struct dpu_encoder_phys *phys" before the sanity checks which can lead to a NULL pointer dereference if phys is NULL. Fix this by reordering the dereference after the sanity checks. 
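A stripped-down view of the reordering (the types are the real DPU ones, but the body is illustrative only):

```
static void trigger_start_sketch(struct dpu_encoder_phys *phys)
{
	struct dpu_encoder_virt *dpu_enc;

	if (!phys)
		return;		/* bail out before phys->parent is ever touched */

	dpu_enc = to_dpu_encoder_virt(phys->parent);	/* previously ran before the check */
	/* ... proceed using dpu_enc ... */
}
```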
Fixes: 8144d17a81d9 ("drm/msm/dpu: Skip trigger flush and start for CWB") Reviewed-by: Dmitry Baryshkov Signed-off-by: Qasim Ijaz Reviewed-by: Jessica Zhang Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/647536/ Link: https://lore.kernel.org/r/20250408172223.10827-1-qasdev00@gmail.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c index 284e69bb47c1..dd13dc4e95a4 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c +++ b/drivers/gpu/drm/msm/disp/dpu1/dpu_encoder.c @@ -1666,7 +1666,7 @@ static void _dpu_encoder_trigger_flush(struct drm_encoder *drm_enc, */ static void _dpu_encoder_trigger_start(struct dpu_encoder_phys *phys) { - struct dpu_encoder_virt *dpu_enc = to_dpu_encoder_virt(phys->parent); + struct dpu_encoder_virt *dpu_enc; if (!phys) { DPU_ERROR("invalid argument(s)\n"); @@ -1678,6 +1678,8 @@ static void _dpu_encoder_trigger_start(struct dpu_encoder_phys *phys) return; } + dpu_enc = to_dpu_encoder_virt(phys->parent); + if (phys->parent->encoder_type == DRM_MODE_ENCODER_VIRTUAL && dpu_enc->cwb_mask) { DPU_DEBUG("encoder %d CWB enabled, skipping\n", DRMID(phys->parent)); -- cgit v1.2.3 From ddfa00afae800b3dea02fa36f3f4012a8379ae58 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Tue, 8 Apr 2025 16:02:44 +0300 Subject: drm/msm/dpu: drop rogue intr_tear_rd_ptr values The commit 5a9d50150c2c ("drm/msm/dpu: shift IRQ indices by 1") shifted IRQ indices by 1, making 'NO_IRQ' to be 0 rather than -1 (and allowing to skip the definition if the IRQ is not present). Several platform files were sketched before that commit, but got applied afterwards. As such, they inherited historical (and currently incorrect) setting of .intr_tear_rd_ptr = -1 for 'NO_IRQ' value. Drop that setting for all the affected platforms. 
Fixes: 62af6e1cb596 ("drm/msm/dpu: Add support for MSM8917") Fixes: c079680bb0fa ("drm/msm/dpu: Add support for MSM8937") Fixes: 7a6109ce1c2c ("drm/msm/dpu: Add support for MSM8953") Fixes: daf9a92daeb8 ("drm/msm/dpu: Add support for MSM8996") Fixes: 7204df5e7e68 ("drm/msm/dpu: add support for SDM660 and SDM630 platforms") Signed-off-by: Dmitry Baryshkov Reviewed-by: Abhinav Kumar Patchwork: https://patchwork.freedesktop.org/patch/647486/ Link: https://lore.kernel.org/r/20250408-dpu-drop-intr-rd-ptr-v1-1-eeac337d88f8@oss.qualcomm.com Signed-off-by: Abhinav Kumar --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_14_msm8937.h | 2 -- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_15_msm8917.h | 1 - drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_16_msm8953.h | 3 --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_7_msm8996.h | 4 ---- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h | 3 --- drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h | 2 -- 6 files changed, 15 deletions(-) diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_14_msm8937.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_14_msm8937.h index 1f32807bb5e5..ad60089f18ea 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_14_msm8937.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_14_msm8937.h @@ -132,7 +132,6 @@ static const struct dpu_intf_cfg msm8937_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, { .name = "intf_2", .id = INTF_2, .base = 0x6b000, .len = 0x268, @@ -141,7 +140,6 @@ static const struct dpu_intf_cfg msm8937_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 28), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 29), - .intr_tear_rd_ptr = -1, }, }; diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_15_msm8917.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_15_msm8917.h index 42131959ff22..a1cf89a0a42d 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_15_msm8917.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_15_msm8917.h @@ -118,7 +118,6 @@ static const struct dpu_intf_cfg msm8917_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, }; diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_16_msm8953.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_16_msm8953.h index 2b4723a5c676..eea9b80e2287 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_16_msm8953.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_16_msm8953.h @@ -131,7 +131,6 @@ static const struct dpu_intf_cfg msm8953_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 24), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 25), - .intr_tear_rd_ptr = -1, }, { .name = "intf_1", .id = INTF_1, .base = 0x6a800, .len = 0x268, @@ -140,7 +139,6 @@ static const struct dpu_intf_cfg msm8953_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, { .name = "intf_2", .id = INTF_2, .base = 0x6b000, .len = 0x268, @@ -149,7 +147,6 @@ static const struct dpu_intf_cfg msm8953_intf[] = { .prog_fetch_lines_worst_case = 14, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 28), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 29), - .intr_tear_rd_ptr = -1, }, }; diff --git 
a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_7_msm8996.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_7_msm8996.h index 5cf19de71f06..ae18a354e5d2 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_7_msm8996.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_1_7_msm8996.h @@ -241,7 +241,6 @@ static const struct dpu_intf_cfg msm8996_intf[] = { .prog_fetch_lines_worst_case = 25, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 24), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 25), - .intr_tear_rd_ptr = -1, }, { .name = "intf_1", .id = INTF_1, .base = 0x6a800, .len = 0x268, @@ -250,7 +249,6 @@ static const struct dpu_intf_cfg msm8996_intf[] = { .prog_fetch_lines_worst_case = 25, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, { .name = "intf_2", .id = INTF_2, .base = 0x6b000, .len = 0x268, @@ -259,7 +257,6 @@ static const struct dpu_intf_cfg msm8996_intf[] = { .prog_fetch_lines_worst_case = 25, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 28), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 29), - .intr_tear_rd_ptr = -1, }, { .name = "intf_3", .id = INTF_3, .base = 0x6b800, .len = 0x268, @@ -267,7 +264,6 @@ static const struct dpu_intf_cfg msm8996_intf[] = { .prog_fetch_lines_worst_case = 25, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 30), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 31), - .intr_tear_rd_ptr = -1, }, }; diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h index 4f2f68b07f20..bb89da0a481d 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_2_sdm660.h @@ -202,7 +202,6 @@ static const struct dpu_intf_cfg sdm660_intf[] = { .prog_fetch_lines_worst_case = 21, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 24), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 25), - .intr_tear_rd_ptr = -1, }, { .name = "intf_1", .id = INTF_1, .base = 0x6a800, .len = 0x280, @@ -211,7 +210,6 @@ static const struct dpu_intf_cfg sdm660_intf[] = { .prog_fetch_lines_worst_case = 21, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, { .name = "intf_2", .id = INTF_2, .base = 0x6b000, .len = 0x280, @@ -220,7 +218,6 @@ static const struct dpu_intf_cfg sdm660_intf[] = { .prog_fetch_lines_worst_case = 21, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 28), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 29), - .intr_tear_rd_ptr = -1, }, }; diff --git a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h index c70bef025ac4..7caf876ca3e3 100644 --- a/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h +++ b/drivers/gpu/drm/msm/disp/dpu1/catalog/dpu_3_3_sdm630.h @@ -147,7 +147,6 @@ static const struct dpu_intf_cfg sdm630_intf[] = { .prog_fetch_lines_worst_case = 21, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 24), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 25), - .intr_tear_rd_ptr = -1, }, { .name = "intf_1", .id = INTF_1, .base = 0x6a800, .len = 0x280, @@ -156,7 +155,6 @@ static const struct dpu_intf_cfg sdm630_intf[] = { .prog_fetch_lines_worst_case = 21, .intr_underrun = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 26), .intr_vsync = DPU_IRQ_IDX(MDP_SSPP_TOP0_INTR, 27), - .intr_tear_rd_ptr = -1, }, }; -- cgit v1.2.3 From 683e9fa1c885a0cffbc10b459a7eee9df92af1c1 Mon Sep 17 00:00:00 2001 From: Maciej Falkowski Date: Tue, 1 Apr 2025 
17:57:55 +0200 Subject: accel/ivpu: Flush pending jobs of device's workqueues Use flush_work() instead of cancel_work_sync() for driver IRQ workqueues to guarantee that remaining pending work will be handled. This resolves two issues that were encountered where a driver was left in an incorrect state as the bottom-half was canceled: 1. Cancelling context-abort of a job that is still executing and is causing translation faults which is going to cause additional TDRs 2. Cancelling bottom-half of a DCT (duty-cycle throttling) request which will cause a device to not be adjusted to an external frequency request. Fixes: bc3e5f48b7ee ("accel/ivpu: Use workqueue for IRQ handling") Signed-off-by: Maciej Falkowski Reviewed-by: Lizhi Hou Reviewed-by: Jeff Hugo Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250401155755.4049156-1-maciej.falkowski@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 4fa73189502e..5e3888ff1022 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -421,9 +421,9 @@ void ivpu_prepare_for_reset(struct ivpu_device *vdev) { ivpu_hw_irq_disable(vdev); disable_irq(vdev->irq); - cancel_work_sync(&vdev->irq_ipc_work); - cancel_work_sync(&vdev->irq_dct_work); - cancel_work_sync(&vdev->context_abort_work); + flush_work(&vdev->irq_ipc_work); + flush_work(&vdev->irq_dct_work); + flush_work(&vdev->context_abort_work); ivpu_ipc_disable(vdev); ivpu_mmu_disable(vdev); } -- cgit v1.2.3 From 082a29e20af43455bc130b14c7426ace6a143819 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Tue, 1 Apr 2025 17:58:17 +0200 Subject: accel/ivpu: Update FW Boot API to version 3.28.3 Update firmware Boot API to 3.28.3 version and adjust driver to API changes for preemption buffers. Use new preemption buffer size fields from FW header added to firmware boot API for preemption buffers allocations, if those new fields are zeroed, use old values instead. Signed-off-by: Karol Wachowski Signed-off-by: Maciej Falkowski Reviewed-by: Jeff Hugo Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250401155817.4049220-1-maciej.falkowski@linux.intel.com --- drivers/accel/ivpu/ivpu_fw.c | 14 +++++++++-- drivers/accel/ivpu/vpu_boot_api.h | 13 +++++++--- drivers/accel/ivpu/vpu_jsm_api.h | 53 ++++++++++++++++++++++++++------------- 3 files changed, 58 insertions(+), 22 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 7a1bb92d8c81..3799231b39e7 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -233,10 +233,20 @@ static int ivpu_fw_parse(struct ivpu_device *vdev) fw->dvfs_mode = 0; fw->sched_mode = ivpu_fw_sched_mode_select(vdev, fw_hdr); - fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; - fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; ivpu_info(vdev, "Scheduler mode: %s\n", fw->sched_mode ? 
"HW" : "OS"); + if (fw_hdr->preemption_buffer_1_max_size) + fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_max_size; + else + fw->primary_preempt_buf_size = fw_hdr->preemption_buffer_1_size; + + if (fw_hdr->preemption_buffer_2_max_size) + fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_max_size; + else + fw->secondary_preempt_buf_size = fw_hdr->preemption_buffer_2_size; + ivpu_dbg(vdev, FW_BOOT, "Preemption buffer sizes: primary %u, secondary %u\n", + fw->primary_preempt_buf_size, fw->secondary_preempt_buf_size); + if (fw_hdr->ro_section_start_address && !is_within_range(fw_hdr->ro_section_start_address, fw_hdr->ro_section_size, fw_hdr->image_load_address, diff --git a/drivers/accel/ivpu/vpu_boot_api.h b/drivers/accel/ivpu/vpu_boot_api.h index 908e68ea1c39..218468bbbcad 100644 --- a/drivers/accel/ivpu/vpu_boot_api.h +++ b/drivers/accel/ivpu/vpu_boot_api.h @@ -26,7 +26,7 @@ * Minor version changes when API backward compatibility is preserved. * Resets to 0 if Major version is incremented. */ -#define VPU_BOOT_API_VER_MINOR 26 +#define VPU_BOOT_API_VER_MINOR 28 /* * API header changed (field names, documentation, formatting) but API itself has not been changed @@ -76,8 +76,15 @@ struct vpu_firmware_header { * submission queue size and device capabilities. */ u32 preemption_buffer_2_size; + /* + * Maximum preemption buffer size that the FW can use: no need for the host + * driver to allocate more space than that specified by these fields. + * A value of 0 means no declared limit. + */ + u32 preemption_buffer_1_max_size; + u32 preemption_buffer_2_max_size; /* Space reserved for future preemption-related fields. */ - u32 preemption_reserved[6]; + u32 preemption_reserved[4]; /* FW image read only section start address, 4KB aligned */ u64 ro_section_start_address; /* FW image read only section size, 4KB aligned */ @@ -134,7 +141,7 @@ enum vpu_trace_destination { /* * Processor bit shifts (for loggable HW components). */ -#define VPU_TRACE_PROC_BIT_ARM 0 +#define VPU_TRACE_PROC_BIT_RESERVED 0 #define VPU_TRACE_PROC_BIT_LRT 1 #define VPU_TRACE_PROC_BIT_LNN 2 #define VPU_TRACE_PROC_BIT_SHV_0 3 diff --git a/drivers/accel/ivpu/vpu_jsm_api.h b/drivers/accel/ivpu/vpu_jsm_api.h index 7215c144158c..4b6b2b3d2583 100644 --- a/drivers/accel/ivpu/vpu_jsm_api.h +++ b/drivers/accel/ivpu/vpu_jsm_api.h @@ -22,7 +22,7 @@ /* * Minor version changes when API backward compatibility is preserved. */ -#define VPU_JSM_API_VER_MINOR 25 +#define VPU_JSM_API_VER_MINOR 29 /* * API header changed (field names, documentation, formatting) but API itself has not been changed @@ -53,8 +53,7 @@ * Engine indexes. */ #define VPU_ENGINE_COMPUTE 0 -#define VPU_ENGINE_COPY 1 -#define VPU_ENGINE_NB 2 +#define VPU_ENGINE_NB 1 /* * VPU status values. @@ -126,11 +125,13 @@ enum { * When set, indicates that job queue uses native fences (as inline commands * in job queue). Such queues may also use legacy fences (as commands in batch buffers). * When cleared, indicates the job queue only uses legacy fences. - * NOTE: For queues using native fences, VPU expects that all jobs in the queue - * are immediately followed by an inline command object. This object is expected - * to be a fence signal command in most cases, but can also be a NOP in case the host - * does not need per-job fence signalling. Other inline commands objects can be - * inserted between "job and inline command" pairs. + * NOTES: + * 1. 
For queues using native fences, VPU expects that all jobs in the queue + * are immediately followed by an inline command object. This object is expected + * to be a fence signal command in most cases, but can also be a NOP in case the host + * does not need per-job fence signalling. Other inline commands objects can be + * inserted between "job and inline command" pairs. + * 2. Native fence queues are only supported on VPU 40xx onwards. */ VPU_JOB_QUEUE_FLAGS_USE_NATIVE_FENCE_MASK = (1 << 1U), @@ -275,6 +276,8 @@ struct vpu_inline_cmd { u64 value; /* User VA of the log buffer in which to add log entry on completion. */ u64 log_buffer_va; + /* NPU private data. */ + u64 npu_private_data; } fence; /* Other commands do not have a payload. */ /* Payload definition for future inline commands can be inserted here. */ @@ -791,12 +794,22 @@ struct vpu_jsm_metric_streamer_update { /** Metric group mask that identifies metric streamer instance. */ u64 metric_group_mask; /** - * Address and size of the buffer where the VPU will write metric data. If - * the buffer address is 0 or same as the currently used buffer the VPU will - * continue writing metric data to the current buffer. In this case the - * buffer size is ignored and the size of the current buffer is unchanged. - * If the address is non-zero and differs from the current buffer address the - * VPU will immediately switch data collection to the new buffer. + * Address and size of the buffer where the VPU will write metric data. + * This member dictates how the update operation should perform: + * 1. client needs information about the number of collected samples and the + * amount of data written to the current buffer + * 2. client wants to switch to a new buffer + * + * Case 1. is identified by the buffer address being 0 or the same as the + * currently used buffer address. In this case the buffer size is ignored and + * the size of the current buffer is unchanged. The VPU will return an update + * in the vpu_jsm_metric_streamer_done structure. The internal writing position + * into the buffer is not changed. + * + * Case 2. is identified by the address being non-zero and differs from the + * current buffer address. The VPU will immediately switch data collection to + * the new buffer. Then the VPU will return an update in the + * vpu_jsm_metric_streamer_done structure. */ u64 buffer_addr; u64 buffer_size; @@ -934,6 +947,7 @@ struct vpu_ipc_msg_payload_hws_priority_band_setup { /* * Default quantum in 100ns units for scheduling across processes * within a priority band + * Minimum value supported by NPU is 1ms (10000 in 100ns units). */ u32 process_quantum[VPU_HWS_NUM_PRIORITY_BANDS]; /* @@ -946,8 +960,10 @@ struct vpu_ipc_msg_payload_hws_priority_band_setup { * in situations when it's starved by the focus band. */ u32 normal_band_percentage; - /* Reserved */ - u32 reserved_0; + /* + * TDR timeout value in milliseconds. Default value of 0 meaning no timeout. + */ + u32 tdr_timeout; }; /* @@ -1024,7 +1040,10 @@ struct vpu_ipc_msg_payload_hws_set_context_sched_properties { s32 in_process_priority; /* Zero padding / Reserved */ u32 reserved_1; - /* Context quantum relative to other contexts of same priority in the same process */ + /* + * Context quantum relative to other contexts of same priority in the same process + * Minimum value supported by NPU is 1ms (10000 in 100ns units). 
+ */ u64 context_quantum; /* Grace period when preempting context of the same priority within the same process */ u64 grace_period_same_priority; -- cgit v1.2.3 From 6c2b75404d33caa46a582f2791a70f92232adb71 Mon Sep 17 00:00:00 2001 From: Andrzej Kacprowski Date: Tue, 1 Apr 2025 17:59:11 +0200 Subject: accel/ivpu: Fix the NPU's DPU frequency calculation Fix the frequency returned to the user space by the DRM_IVPU_PARAM_CORE_CLOCK_RATE GET_PARAM IOCTL. The kernel driver returned CPU frequency for MTL and bare PLL frequency for LNL - this was inconsistent and incorrect for both platforms. With this fix the driver returns maximum frequency of the NPU data processing unit (DPU) for all HW generations. This is what user space always expected. Also do not set CPU frequency in boot params - the firmware does not use frequency passed from the driver, it was only used by the early pre-production firmware. With that we can remove CPU frequency calculation code. Show NPU frequency in FREQ_CHANGE interrupt when frequency tracking is enabled. Fixes: 8a27ad81f7d3 ("accel/ivpu: Split IP and buttress code") Cc: stable@vger.kernel.org # v6.11+ Signed-off-by: Andrzej Kacprowski Signed-off-by: Maciej Falkowski Reviewed-by: Jeff Hugo Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250401155912.4049340-2-maciej.falkowski@linux.intel.com --- drivers/accel/ivpu/ivpu_drv.c | 4 +- drivers/accel/ivpu/ivpu_fw.c | 3 +- drivers/accel/ivpu/ivpu_hw.h | 11 +--- drivers/accel/ivpu/ivpu_hw_btrs.c | 126 +++++++++++++++++--------------------- drivers/accel/ivpu/ivpu_hw_btrs.h | 6 +- include/uapi/drm/ivpu_accel.h | 4 +- 6 files changed, 66 insertions(+), 88 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_drv.c b/drivers/accel/ivpu/ivpu_drv.c index 5e3888ff1022..eff1d3ca075f 100644 --- a/drivers/accel/ivpu/ivpu_drv.c +++ b/drivers/accel/ivpu/ivpu_drv.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #include @@ -164,7 +164,7 @@ static int ivpu_get_param_ioctl(struct drm_device *dev, void *data, struct drm_f args->value = vdev->platform; break; case DRM_IVPU_PARAM_CORE_CLOCK_RATE: - args->value = ivpu_hw_ratio_to_freq(vdev, vdev->hw->pll.max_ratio); + args->value = ivpu_hw_dpu_max_freq_get(vdev); break; case DRM_IVPU_PARAM_NUM_CONTEXTS: args->value = ivpu_get_context_count(vdev); diff --git a/drivers/accel/ivpu/ivpu_fw.c b/drivers/accel/ivpu/ivpu_fw.c index 3799231b39e7..5e1d709c6a46 100644 --- a/drivers/accel/ivpu/ivpu_fw.c +++ b/drivers/accel/ivpu/ivpu_fw.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #include @@ -576,7 +576,6 @@ void ivpu_fw_boot_params_setup(struct ivpu_device *vdev, struct vpu_boot_params boot_params->magic = VPU_BOOT_PARAMS_MAGIC; boot_params->vpu_id = to_pci_dev(vdev->drm.dev)->bus->number; - boot_params->frequency = ivpu_hw_pll_freq_get(vdev); /* * This param is a debug firmware feature. 
It switches default clock diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index 16435f2756d0..6e67e736ef8e 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_HW_H__ @@ -82,9 +82,9 @@ static inline u64 ivpu_hw_range_size(const struct ivpu_addr_range *range) return range->end - range->start; } -static inline u32 ivpu_hw_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) +static inline u32 ivpu_hw_dpu_max_freq_get(struct ivpu_device *vdev) { - return ivpu_hw_btrs_ratio_to_freq(vdev, ratio); + return ivpu_hw_btrs_dpu_max_freq_get(vdev); } static inline void ivpu_hw_irq_clear(struct ivpu_device *vdev) @@ -92,11 +92,6 @@ static inline void ivpu_hw_irq_clear(struct ivpu_device *vdev) ivpu_hw_ip_irq_clear(vdev); } -static inline u32 ivpu_hw_pll_freq_get(struct ivpu_device *vdev) -{ - return ivpu_hw_btrs_pll_freq_get(vdev); -} - static inline u32 ivpu_hw_profiling_freq_get(struct ivpu_device *vdev) { return vdev->hw->pll.profiling_freq; diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.c b/drivers/accel/ivpu/ivpu_hw_btrs.c index 56c56012b980..91eb8a93c15b 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.c +++ b/drivers/accel/ivpu/ivpu_hw_btrs.c @@ -1,8 +1,10 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ +#include + #include "ivpu_drv.h" #include "ivpu_hw.h" #include "ivpu_hw_btrs.h" @@ -28,17 +30,13 @@ #define BTRS_LNL_ALL_IRQ_MASK ((u32)-1) -#define BTRS_MTL_WP_CONFIG_1_TILE_5_3_RATIO WP_CONFIG(MTL_CONFIG_1_TILE, MTL_PLL_RATIO_5_3) -#define BTRS_MTL_WP_CONFIG_1_TILE_4_3_RATIO WP_CONFIG(MTL_CONFIG_1_TILE, MTL_PLL_RATIO_4_3) -#define BTRS_MTL_WP_CONFIG_2_TILE_5_3_RATIO WP_CONFIG(MTL_CONFIG_2_TILE, MTL_PLL_RATIO_5_3) -#define BTRS_MTL_WP_CONFIG_2_TILE_4_3_RATIO WP_CONFIG(MTL_CONFIG_2_TILE, MTL_PLL_RATIO_4_3) -#define BTRS_MTL_WP_CONFIG_0_TILE_PLL_OFF WP_CONFIG(0, 0) #define PLL_CDYN_DEFAULT 0x80 #define PLL_EPP_DEFAULT 0x80 #define PLL_CONFIG_DEFAULT 0x0 -#define PLL_SIMULATION_FREQ 10000000 -#define PLL_REF_CLK_FREQ 50000000 +#define PLL_REF_CLK_FREQ 50000000ull +#define PLL_RATIO_TO_FREQ(x) ((x) * PLL_REF_CLK_FREQ) + #define PLL_TIMEOUT_US (1500 * USEC_PER_MSEC) #define IDLE_TIMEOUT_US (5 * USEC_PER_MSEC) #define TIMEOUT_US (150 * USEC_PER_MSEC) @@ -62,6 +60,8 @@ #define DCT_ENABLE 0x1 #define DCT_DISABLE 0x0 +static u32 pll_ratio_to_dpu_freq(struct ivpu_device *vdev, u32 ratio); + int ivpu_hw_btrs_irqs_clear_with_0_mtl(struct ivpu_device *vdev) { REGB_WR32(VPU_HW_BTRS_MTL_INTERRUPT_STAT, BTRS_MTL_ALL_IRQ_MASK); @@ -156,7 +156,7 @@ static int info_init_mtl(struct ivpu_device *vdev) hw->tile_fuse = BTRS_MTL_TILE_FUSE_ENABLE_BOTH; hw->sku = BTRS_MTL_TILE_SKU_BOTH; - hw->config = BTRS_MTL_WP_CONFIG_2_TILE_4_3_RATIO; + hw->config = WP_CONFIG(MTL_CONFIG_2_TILE, MTL_PLL_RATIO_4_3); return 0; } @@ -334,8 +334,8 @@ int ivpu_hw_btrs_wp_drive(struct ivpu_device *vdev, bool enable) prepare_wp_request(vdev, &wp, enable); - ivpu_dbg(vdev, PM, "PLL workpoint request: %u Hz, config: 0x%x, epp: 0x%x, cdyn: 0x%x\n", - PLL_RATIO_TO_FREQ(wp.target), wp.cfg, wp.epp, wp.cdyn); + ivpu_dbg(vdev, PM, "PLL workpoint request: %lu MHz, config: 0x%x, epp: 0x%x, cdyn: 0x%x\n", + pll_ratio_to_dpu_freq(vdev, wp.target) / HZ_PER_MHZ, wp.cfg, wp.epp, wp.cdyn); ret = wp_request_send(vdev, &wp); if (ret) { @@ -573,6 +573,39 @@ 
int ivpu_hw_btrs_wait_for_idle(struct ivpu_device *vdev) return REGB_POLL_FLD(VPU_HW_BTRS_LNL_VPU_STATUS, IDLE, 0x1, IDLE_TIMEOUT_US); } +static u32 pll_config_get_mtl(struct ivpu_device *vdev) +{ + return REGB_RD32(VPU_HW_BTRS_MTL_CURRENT_PLL); +} + +static u32 pll_config_get_lnl(struct ivpu_device *vdev) +{ + return REGB_RD32(VPU_HW_BTRS_LNL_PLL_FREQ); +} + +static u32 pll_ratio_to_dpu_freq_mtl(u16 ratio) +{ + return (PLL_RATIO_TO_FREQ(ratio) * 2) / 3; +} + +static u32 pll_ratio_to_dpu_freq_lnl(u16 ratio) +{ + return PLL_RATIO_TO_FREQ(ratio) / 2; +} + +static u32 pll_ratio_to_dpu_freq(struct ivpu_device *vdev, u32 ratio) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return pll_ratio_to_dpu_freq_mtl(ratio); + else + return pll_ratio_to_dpu_freq_lnl(ratio); +} + +u32 ivpu_hw_btrs_dpu_max_freq_get(struct ivpu_device *vdev) +{ + return pll_ratio_to_dpu_freq(vdev, vdev->hw->pll.max_ratio); +} + /* Handler for IRQs from Buttress core (irqB) */ bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq) { @@ -582,9 +615,12 @@ bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq) if (!status) return false; - if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, FREQ_CHANGE, status)) - ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x", - REGB_RD32(VPU_HW_BTRS_MTL_CURRENT_PLL)); + if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, FREQ_CHANGE, status)) { + u32 pll = pll_config_get_mtl(vdev); + + ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq, wp %08x, %lu MHz", + pll, pll_ratio_to_dpu_freq_mtl(pll) / HZ_PER_MHZ); + } if (REG_TEST_FLD(VPU_HW_BTRS_MTL_INTERRUPT_STAT, ATS_ERR, status)) { ivpu_err(vdev, "ATS_ERR irq 0x%016llx", REGB_RD64(VPU_HW_BTRS_MTL_ATS_ERR_LOG_0)); @@ -633,8 +669,12 @@ bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq) queue_work(system_wq, &vdev->irq_dct_work); } - if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) - ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq: %08x", REGB_RD32(VPU_HW_BTRS_LNL_PLL_FREQ)); + if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, FREQ_CHANGE, status)) { + u32 pll = pll_config_get_lnl(vdev); + + ivpu_dbg(vdev, IRQ, "FREQ_CHANGE irq, wp %08x, %lu MHz", + pll, pll_ratio_to_dpu_freq_lnl(pll) / HZ_PER_MHZ); + } if (REG_TEST_FLD(VPU_HW_BTRS_LNL_INTERRUPT_STAT, ATS_ERR, status)) { ivpu_err(vdev, "ATS_ERR LOG1 0x%08x ATS_ERR_LOG2 0x%08x\n", @@ -717,60 +757,6 @@ void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 acti REGB_WR32(VPU_HW_BTRS_LNL_PCODE_MAILBOX_STATUS, val); } -static u32 pll_ratio_to_freq_mtl(u32 ratio, u32 config) -{ - u32 pll_clock = PLL_REF_CLK_FREQ * ratio; - u32 cpu_clock; - - if ((config & 0xff) == MTL_PLL_RATIO_4_3) - cpu_clock = pll_clock * 2 / 4; - else - cpu_clock = pll_clock * 2 / 5; - - return cpu_clock; -} - -u32 ivpu_hw_btrs_ratio_to_freq(struct ivpu_device *vdev, u32 ratio) -{ - struct ivpu_hw_info *hw = vdev->hw; - - if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) - return pll_ratio_to_freq_mtl(ratio, hw->config); - else - return PLL_RATIO_TO_FREQ(ratio); -} - -static u32 pll_freq_get_mtl(struct ivpu_device *vdev) -{ - u32 pll_curr_ratio; - - pll_curr_ratio = REGB_RD32(VPU_HW_BTRS_MTL_CURRENT_PLL); - pll_curr_ratio &= VPU_HW_BTRS_MTL_CURRENT_PLL_RATIO_MASK; - - if (!ivpu_is_silicon(vdev)) - return PLL_SIMULATION_FREQ; - - return pll_ratio_to_freq_mtl(pll_curr_ratio, vdev->hw->config); -} - -static u32 pll_freq_get_lnl(struct ivpu_device *vdev) -{ - u32 pll_curr_ratio; - - pll_curr_ratio = REGB_RD32(VPU_HW_BTRS_LNL_PLL_FREQ); - pll_curr_ratio &= 
VPU_HW_BTRS_LNL_PLL_FREQ_RATIO_MASK; - - return PLL_RATIO_TO_FREQ(pll_curr_ratio); -} - -u32 ivpu_hw_btrs_pll_freq_get(struct ivpu_device *vdev) -{ - if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) - return pll_freq_get_mtl(vdev); - else - return pll_freq_get_lnl(vdev); -} - u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev) { if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.h b/drivers/accel/ivpu/ivpu_hw_btrs.h index 1fd71b4d4ab0..a2f0877237e8 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __IVPU_HW_BTRS_H__ @@ -13,7 +13,6 @@ #define PLL_PROFILING_FREQ_DEFAULT 38400000 #define PLL_PROFILING_FREQ_HIGH 400000000 -#define PLL_RATIO_TO_FREQ(x) ((x) * PLL_REF_CLK_FREQ) #define DCT_DEFAULT_ACTIVE_PERCENT 15u #define DCT_PERIOD_US 35300u @@ -32,12 +31,11 @@ int ivpu_hw_btrs_ip_reset(struct ivpu_device *vdev); void ivpu_hw_btrs_profiling_freq_reg_set_lnl(struct ivpu_device *vdev); void ivpu_hw_btrs_ats_print_lnl(struct ivpu_device *vdev); void ivpu_hw_btrs_clock_relinquish_disable_lnl(struct ivpu_device *vdev); +u32 ivpu_hw_btrs_dpu_max_freq_get(struct ivpu_device *vdev); bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq); bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq); int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable); void ivpu_hw_btrs_dct_set_status(struct ivpu_device *vdev, bool enable, u32 dct_percent); -u32 ivpu_hw_btrs_pll_freq_get(struct ivpu_device *vdev); -u32 ivpu_hw_btrs_ratio_to_freq(struct ivpu_device *vdev, u32 ratio); u32 ivpu_hw_btrs_telemetry_offset_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_size_get(struct ivpu_device *vdev); u32 ivpu_hw_btrs_telemetry_enable_get(struct ivpu_device *vdev); diff --git a/include/uapi/drm/ivpu_accel.h b/include/uapi/drm/ivpu_accel.h index 746c43bd3eb6..2f24103f4533 100644 --- a/include/uapi/drm/ivpu_accel.h +++ b/include/uapi/drm/ivpu_accel.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only WITH Linux-syscall-note */ /* - * Copyright (C) 2020-2024 Intel Corporation + * Copyright (C) 2020-2025 Intel Corporation */ #ifndef __UAPI_IVPU_DRM_H__ @@ -147,7 +147,7 @@ struct drm_ivpu_param { * platform type when executing on a simulator or emulator (read-only) * * %DRM_IVPU_PARAM_CORE_CLOCK_RATE: - * Current PLL frequency (read-only) + * Maximum frequency of the NPU data processing unit clock (read-only) * * %DRM_IVPU_PARAM_NUM_CONTEXTS: * Maximum number of simultaneously existing contexts (read-only) -- cgit v1.2.3 From 1524c28b995279db7f01abda7bf0fd26e47aefba Mon Sep 17 00:00:00 2001 From: Andrzej Kacprowski Date: Tue, 1 Apr 2025 17:59:12 +0200 Subject: accel/ivpu: Show NPU frequency in sysfs Add sysfs files that show maximum and current frequency of the NPU's data processing unit. 
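(Illustrative aside, not part of this patch: a minimal user-space reader for the new attributes could look like the sketch below. The sysfs path used here is an assumption; the real location depends on how the accel device is enumerated on a given system.)

/* Sketch only: read one of the new NPU frequency attributes. */
#include <stdio.h>

int main(void)
{
	/* Hypothetical path, adjust to the actual accel device directory. */
	const char *path = "/sys/class/accel/accel0/device/npu_current_frequency_mhz";
	FILE *f = fopen(path, "r");
	unsigned long mhz;

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fscanf(f, "%lu", &mhz) == 1)
		printf("NPU DPU frequency: %lu MHz\n", mhz);
	fclose(f);
	return 0;
}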
New sysfs entries: - npu_max_frequency_mhz - npu_current_frequency_mhz Signed-off-by: Andrzej Kacprowski Signed-off-by: Maciej Falkowski Reviewed-by: Jacek Lawrynowicz Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250401155912.4049340-3-maciej.falkowski@linux.intel.com --- drivers/accel/ivpu/ivpu_hw.h | 5 ++++ drivers/accel/ivpu/ivpu_hw_btrs.c | 8 +++++++ drivers/accel/ivpu/ivpu_hw_btrs.h | 1 + drivers/accel/ivpu/ivpu_sysfs.c | 49 ++++++++++++++++++++++++++++++++++++++- 4 files changed, 62 insertions(+), 1 deletion(-) diff --git a/drivers/accel/ivpu/ivpu_hw.h b/drivers/accel/ivpu/ivpu_hw.h index 6e67e736ef8e..d79668fe1609 100644 --- a/drivers/accel/ivpu/ivpu_hw.h +++ b/drivers/accel/ivpu/ivpu_hw.h @@ -87,6 +87,11 @@ static inline u32 ivpu_hw_dpu_max_freq_get(struct ivpu_device *vdev) return ivpu_hw_btrs_dpu_max_freq_get(vdev); } +static inline u32 ivpu_hw_dpu_freq_get(struct ivpu_device *vdev) +{ + return ivpu_hw_btrs_dpu_freq_get(vdev); +} + static inline void ivpu_hw_irq_clear(struct ivpu_device *vdev) { ivpu_hw_ip_irq_clear(vdev); diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.c b/drivers/accel/ivpu/ivpu_hw_btrs.c index 91eb8a93c15b..b236c7234daa 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.c +++ b/drivers/accel/ivpu/ivpu_hw_btrs.c @@ -606,6 +606,14 @@ u32 ivpu_hw_btrs_dpu_max_freq_get(struct ivpu_device *vdev) return pll_ratio_to_dpu_freq(vdev, vdev->hw->pll.max_ratio); } +u32 ivpu_hw_btrs_dpu_freq_get(struct ivpu_device *vdev) +{ + if (ivpu_hw_btrs_gen(vdev) == IVPU_HW_BTRS_MTL) + return pll_ratio_to_dpu_freq_mtl(pll_config_get_mtl(vdev)); + else + return pll_ratio_to_dpu_freq_lnl(pll_config_get_lnl(vdev)); +} + /* Handler for IRQs from Buttress core (irqB) */ bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq) { diff --git a/drivers/accel/ivpu/ivpu_hw_btrs.h b/drivers/accel/ivpu/ivpu_hw_btrs.h index a2f0877237e8..300f749971d4 100644 --- a/drivers/accel/ivpu/ivpu_hw_btrs.h +++ b/drivers/accel/ivpu/ivpu_hw_btrs.h @@ -32,6 +32,7 @@ void ivpu_hw_btrs_profiling_freq_reg_set_lnl(struct ivpu_device *vdev); void ivpu_hw_btrs_ats_print_lnl(struct ivpu_device *vdev); void ivpu_hw_btrs_clock_relinquish_disable_lnl(struct ivpu_device *vdev); u32 ivpu_hw_btrs_dpu_max_freq_get(struct ivpu_device *vdev); +u32 ivpu_hw_btrs_dpu_freq_get(struct ivpu_device *vdev); bool ivpu_hw_btrs_irq_handler_mtl(struct ivpu_device *vdev, int irq); bool ivpu_hw_btrs_irq_handler_lnl(struct ivpu_device *vdev, int irq); int ivpu_hw_btrs_dct_get_request(struct ivpu_device *vdev, bool *enable); diff --git a/drivers/accel/ivpu/ivpu_sysfs.c b/drivers/accel/ivpu/ivpu_sysfs.c index 97102feaf8dd..268ab7744a8b 100644 --- a/drivers/accel/ivpu/ivpu_sysfs.c +++ b/drivers/accel/ivpu/ivpu_sysfs.c @@ -1,10 +1,12 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation */ #include #include +#include +#include #include "ivpu_drv.h" #include "ivpu_gem.h" @@ -90,10 +92,55 @@ sched_mode_show(struct device *dev, struct device_attribute *attr, char *buf) static DEVICE_ATTR_RO(sched_mode); +/** + * DOC: npu_max_frequency + * + * The npu_max_frequency shows maximum frequency in MHz of the NPU's data + * processing unit + */ +static ssize_t +npu_max_frequency_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct drm_device *drm = dev_get_drvdata(dev); + struct ivpu_device *vdev = to_ivpu_device(drm); + u32 freq = ivpu_hw_dpu_max_freq_get(vdev); + + return sysfs_emit(buf, "%lu\n", freq / HZ_PER_MHZ); +} + 
+static DEVICE_ATTR_RO(npu_max_frequency_mhz); + +/** + * DOC: npu_current_frequency_mhz + * + * The npu_current_frequency_mhz shows current frequency in MHz of the NPU's + * data processing unit + */ +static ssize_t +npu_current_frequency_mhz_show(struct device *dev, struct device_attribute *attr, char *buf) +{ + struct drm_device *drm = dev_get_drvdata(dev); + struct ivpu_device *vdev = to_ivpu_device(drm); + u32 freq = 0; + + /* Read frequency only if device is active, otherwise frequency is 0 */ + if (pm_runtime_get_if_active(vdev->drm.dev) > 0) { + freq = ivpu_hw_dpu_freq_get(vdev); + + pm_runtime_put_autosuspend(vdev->drm.dev); + } + + return sysfs_emit(buf, "%lu\n", freq / HZ_PER_MHZ); +} + +static DEVICE_ATTR_RO(npu_current_frequency_mhz); + static struct attribute *ivpu_dev_attrs[] = { &dev_attr_npu_busy_time_us.attr, &dev_attr_npu_memory_utilization.attr, &dev_attr_sched_mode.attr, + &dev_attr_npu_max_frequency_mhz.attr, + &dev_attr_npu_current_frequency_mhz.attr, NULL, }; -- cgit v1.2.3 From 31660b406d872b5ccb3c2ec6f932969809c35b18 Mon Sep 17 00:00:00 2001 From: Karol Wachowski Date: Tue, 1 Apr 2025 17:59:39 +0200 Subject: accel/ivpu: Add cmdq_id to job related logs Add tracking of command queue ID in JOB debug message to improve debugging capabilities. Signed-off-by: Karol Wachowski Signed-off-by: Maciej Falkowski Reviewed-by: Lizhi Hou Reviewed-by: Jeff Hugo Signed-off-by: Jacek Lawrynowicz Link: https://lore.kernel.org/r/20250401155939.4049467-1-maciej.falkowski@linux.intel.com --- drivers/accel/ivpu/ivpu_job.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/accel/ivpu/ivpu_job.c b/drivers/accel/ivpu/ivpu_job.c index 004059e4f1e8..863e3cd6ace5 100644 --- a/drivers/accel/ivpu/ivpu_job.c +++ b/drivers/accel/ivpu/ivpu_job.c @@ -470,8 +470,8 @@ static void ivpu_job_destroy(struct ivpu_job *job) struct ivpu_device *vdev = job->vdev; u32 i; - ivpu_dbg(vdev, JOB, "Job destroyed: id %3u ctx %2d engine %d", - job->job_id, job->file_priv->ctx.id, job->engine_idx); + ivpu_dbg(vdev, JOB, "Job destroyed: id %3u ctx %2d cmdq_id %u engine %d", + job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx); for (i = 0; i < job->bo_count; i++) if (job->bos[i]) @@ -564,8 +564,8 @@ static int ivpu_job_signal_and_destroy(struct ivpu_device *vdev, u32 job_id, u32 dma_fence_signal(job->done_fence); trace_job("done", job); - ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d engine %d status 0x%x\n", - job->job_id, job->file_priv->ctx.id, job->engine_idx, job_status); + ivpu_dbg(vdev, JOB, "Job complete: id %3u ctx %2d cmdq_id %u engine %d status 0x%x\n", + job->job_id, job->file_priv->ctx.id, job->cmdq_id, job->engine_idx, job_status); ivpu_job_destroy(job); ivpu_stop_job_timeout_detection(vdev); @@ -664,8 +664,8 @@ static int ivpu_job_submit(struct ivpu_job *job, u8 priority, u32 cmdq_id) } trace_job("submit", job); - ivpu_dbg(vdev, JOB, "Job submitted: id %3u ctx %2d engine %d prio %d addr 0x%llx next %d\n", - job->job_id, file_priv->ctx.id, job->engine_idx, cmdq->priority, + ivpu_dbg(vdev, JOB, "Job submitted: id %3u ctx %2d cmdq_id %u engine %d prio %d addr 0x%llx next %d\n", + job->job_id, file_priv->ctx.id, cmdq->id, job->engine_idx, cmdq->priority, job->cmd_buf_vpu_addr, cmdq->jobq->header.tail); mutex_unlock(&file_priv->lock); @@ -777,7 +777,8 @@ static int ivpu_submit(struct drm_file *file, struct ivpu_file_priv *file_priv, goto err_free_handles; } - ivpu_dbg(vdev, JOB, "Submit ioctl: ctx %u buf_count %u\n", file_priv->ctx.id, buffer_count); 
+ ivpu_dbg(vdev, JOB, "Submit ioctl: ctx %u cmdq_id %u buf_count %u\n", + file_priv->ctx.id, cmdq_id, buffer_count); job = ivpu_job_create(file_priv, engine, buffer_count); if (!job) { -- cgit v1.2.3 From 4767af82a08ffaa5e55fe71febfa8cdef201b620 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 10 Apr 2025 19:17:21 +0200 Subject: landlock: Log the TGID of the domain creator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As for other Audit's "pid" fields, Landlock should use the task's TGID instead of its TID. Fix this issue by keeping a reference to the TGID of the domain creator. Existing tests already check for the PID but only with the thread group leader, so always the TGID. A following patch adds dedicated tests for non-leader thread. Remove the current_real_cred() check which does not make sense because we only reference a struct pid, whereas a previous version did reference a struct cred instead. Cc: Christian Brauner Cc: Paul Moore Reviewed-by: Günther Noack Link: https://lore.kernel.org/r/20250410171725.1265860-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- security/landlock/domain.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/security/landlock/domain.c b/security/landlock/domain.c index bae2e9909013..a647b68e8d06 100644 --- a/security/landlock/domain.c +++ b/security/landlock/domain.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "access.h" @@ -99,8 +100,7 @@ static struct landlock_details *get_current_details(void) return ERR_PTR(-ENOMEM); memcpy(details->exe_path, path_str, path_size); - WARN_ON_ONCE(current_cred() != current_real_cred()); - details->pid = get_pid(task_pid(current)); + details->pid = get_pid(task_tgid(current)); details->uid = from_kuid(&init_user_ns, current_uid()); get_task_comm(details->comm, current); return details; -- cgit v1.2.3 From e4a0f9e0cacd93094b619616426a273e0bc9107e Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 10 Apr 2025 19:17:22 +0200 Subject: selftests/landlock: Factor out audit fixture in audit_test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The audit fixture needlessly stores and manages domain_stack. Move it to the audit.layers tests. This will be useful to reuse the audit fixture with the next patch. 
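(Illustrative aside, not part of this patch: domain_stack lives in an anonymous MAP_SHARED mapping so that values stored by the forked child remain visible to the parent after the child exits. A stand-alone sketch of that pattern, with hypothetical values:)

/* Sketch only: anonymous shared mapping surviving fork(). */
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	uint64_t (*stack)[16];

	stack = mmap(NULL, sizeof(*stack), PROT_READ | PROT_WRITE,
		     MAP_SHARED | MAP_ANONYMOUS, -1, 0);
	if (stack == MAP_FAILED)
		return 1;
	memset(stack, 0, sizeof(*stack));

	if (fork() == 0) {
		(*stack)[0] = 42;	/* the child writes into the shared page */
		_exit(0);
	}
	wait(NULL);
	printf("parent sees %llu\n", (unsigned long long)(*stack)[0]);	/* prints 42 */
	return munmap(stack, sizeof(*stack));
}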
Cc: Günther Noack Link: https://lore.kernel.org/r/20250410171725.1265860-2-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/audit_test.c | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/tools/testing/selftests/landlock/audit_test.c b/tools/testing/selftests/landlock/audit_test.c index a0643070c403..815c0f03e1fb 100644 --- a/tools/testing/selftests/landlock/audit_test.c +++ b/tools/testing/selftests/landlock/audit_test.c @@ -40,7 +40,6 @@ FIXTURE(audit) { struct audit_filter audit_filter; int audit_fd; - __u64(*domain_stack)[16]; }; FIXTURE_SETUP(audit) @@ -60,18 +59,10 @@ FIXTURE_SETUP(audit) TH_LOG("Failed to initialize audit: %s", error_msg); } clear_cap(_metadata, CAP_AUDIT_CONTROL); - - self->domain_stack = mmap(NULL, sizeof(*self->domain_stack), - PROT_READ | PROT_WRITE, - MAP_SHARED | MAP_ANONYMOUS, -1, 0); - ASSERT_NE(MAP_FAILED, self->domain_stack); - memset(self->domain_stack, 0, sizeof(*self->domain_stack)); } FIXTURE_TEARDOWN(audit) { - EXPECT_EQ(0, munmap(self->domain_stack, sizeof(*self->domain_stack))); - set_cap(_metadata, CAP_AUDIT_CONTROL); EXPECT_EQ(0, audit_cleanup(self->audit_fd, &self->audit_filter)); clear_cap(_metadata, CAP_AUDIT_CONTROL); @@ -83,9 +74,15 @@ TEST_F(audit, layers) .scoped = LANDLOCK_SCOPE_SIGNAL, }; int status, ruleset_fd, i; + __u64(*domain_stack)[16]; __u64 prev_dom = 3; pid_t child; + domain_stack = mmap(NULL, sizeof(*domain_stack), PROT_READ | PROT_WRITE, + MAP_SHARED | MAP_ANONYMOUS, -1, 0); + ASSERT_NE(MAP_FAILED, domain_stack); + memset(domain_stack, 0, sizeof(*domain_stack)); + ruleset_fd = landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); ASSERT_LE(0, ruleset_fd); @@ -94,7 +91,7 @@ TEST_F(audit, layers) child = fork(); ASSERT_LE(0, child); if (child == 0) { - for (i = 0; i < ARRAY_SIZE(*self->domain_stack); i++) { + for (i = 0; i < ARRAY_SIZE(*domain_stack); i++) { __u64 denial_dom = 1; __u64 allocated_dom = 2; @@ -115,7 +112,7 @@ TEST_F(audit, layers) /* Checks that the new domain is younger than the previous one. */ EXPECT_GT(allocated_dom, prev_dom); prev_dom = allocated_dom; - (*self->domain_stack)[i] = allocated_dom; + (*domain_stack)[i] = allocated_dom; } /* Checks that we reached the maximum number of layers. */ @@ -142,20 +139,20 @@ TEST_F(audit, layers) /* Purges log from deallocated domains. 
*/ EXPECT_EQ(0, setsockopt(self->audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_dom_drop, sizeof(audit_tv_dom_drop))); - for (i = ARRAY_SIZE(*self->domain_stack) - 1; i >= 0; i--) { + for (i = ARRAY_SIZE(*domain_stack) - 1; i >= 0; i--) { __u64 deallocated_dom = 2; EXPECT_EQ(0, matches_log_domain_deallocated(self->audit_fd, 1, &deallocated_dom)); - EXPECT_EQ((*self->domain_stack)[i], deallocated_dom) + EXPECT_EQ((*domain_stack)[i], deallocated_dom) { TH_LOG("Failed to match domain %llx (#%d)", - (*self->domain_stack)[i], i); + (*domain_stack)[i], i); } } + EXPECT_EQ(0, munmap(domain_stack, sizeof(*domain_stack))); EXPECT_EQ(0, setsockopt(self->audit_fd, SOL_SOCKET, SO_RCVTIMEO, &audit_tv_default, sizeof(audit_tv_default))); - EXPECT_EQ(0, close(ruleset_fd)); } -- cgit v1.2.3 From 6b4566400a2919e6c1137404c53d7cf1ada559aa Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Thu, 10 Apr 2025 19:17:23 +0200 Subject: selftests/landlock: Add PID tests for audit records MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add audit.thread tests to check that the PID tied to a domain is not a thread ID but the thread group ID. These new tests would not pass without the previous TGID fix. Extend matches_log_domain_allocated() to check against the PID that created the domain. Test coverage for security/landlock is 93.6% of 1524 lines according to gcc/gcov-14. Cc: Christian Brauner Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20250410171725.1265860-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- tools/testing/selftests/landlock/audit.h | 21 +++-- tools/testing/selftests/landlock/audit_test.c | 127 +++++++++++++++++++++++++- tools/testing/selftests/landlock/fs_test.c | 3 +- 3 files changed, 141 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/landlock/audit.h b/tools/testing/selftests/landlock/audit.h index b9054086a0c9..18a6014920b5 100644 --- a/tools/testing/selftests/landlock/audit.h +++ b/tools/testing/selftests/landlock/audit.h @@ -300,15 +300,22 @@ out: return err; } -static int __maybe_unused matches_log_domain_allocated(int audit_fd, +static int __maybe_unused matches_log_domain_allocated(int audit_fd, pid_t pid, __u64 *domain_id) { - return audit_match_record( - audit_fd, AUDIT_LANDLOCK_DOMAIN, - REGEX_LANDLOCK_PREFIX - " status=allocated mode=enforcing pid=[0-9]\\+ uid=[0-9]\\+" - " exe=\"[^\"]\\+\" comm=\".*_test\"$", - domain_id); + static const char log_template[] = REGEX_LANDLOCK_PREFIX + " status=allocated mode=enforcing pid=%d uid=[0-9]\\+" + " exe=\"[^\"]\\+\" comm=\".*_test\"$"; + char log_match[sizeof(log_template) + 10]; + int log_match_len; + + log_match_len = + snprintf(log_match, sizeof(log_match), log_template, pid); + if (log_match_len > sizeof(log_match)) + return -E2BIG; + + return audit_match_record(audit_fd, AUDIT_LANDLOCK_DOMAIN, log_match, + domain_id); } static int __maybe_unused matches_log_domain_deallocated( diff --git a/tools/testing/selftests/landlock/audit_test.c b/tools/testing/selftests/landlock/audit_test.c index 815c0f03e1fb..cfc571afd0eb 100644 --- a/tools/testing/selftests/landlock/audit_test.c +++ b/tools/testing/selftests/landlock/audit_test.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -104,7 +105,8 @@ TEST_F(audit, layers) matches_log_signal(_metadata, self->audit_fd, getppid(), &denial_dom)); EXPECT_EQ(0, matches_log_domain_allocated( - self->audit_fd, &allocated_dom)); + self->audit_fd, getpid(), + &allocated_dom)); EXPECT_NE(denial_dom, 
1); EXPECT_NE(denial_dom, 0); EXPECT_EQ(denial_dom, allocated_dom); @@ -156,6 +158,126 @@ TEST_F(audit, layers) EXPECT_EQ(0, close(ruleset_fd)); } +struct thread_data { + pid_t parent_pid; + int ruleset_fd, pipe_child, pipe_parent; +}; + +static void *thread_audit_test(void *arg) +{ + const struct thread_data *data = (struct thread_data *)arg; + uintptr_t err = 0; + char buffer; + + /* TGID and TID are different for a second thread. */ + if (getpid() == gettid()) { + err = 1; + goto out; + } + + if (landlock_restrict_self(data->ruleset_fd, 0)) { + err = 2; + goto out; + } + + if (close(data->ruleset_fd)) { + err = 3; + goto out; + } + + /* Creates a denial to get the domain ID. */ + if (kill(data->parent_pid, 0) != -1) { + err = 4; + goto out; + } + + if (EPERM != errno) { + err = 5; + goto out; + } + + /* Signals the parent to read denial logs. */ + if (write(data->pipe_child, ".", 1) != 1) { + err = 6; + goto out; + } + + /* Waits for the parent to update audit filters. */ + if (read(data->pipe_parent, &buffer, 1) != 1) { + err = 7; + goto out; + } + +out: + close(data->pipe_child); + close(data->pipe_parent); + return (void *)err; +} + +/* Checks that the PID tied to a domain is not a TID but the TGID. */ +TEST_F(audit, thread) +{ + const struct landlock_ruleset_attr ruleset_attr = { + .scoped = LANDLOCK_SCOPE_SIGNAL, + }; + __u64 denial_dom = 1; + __u64 allocated_dom = 2; + __u64 deallocated_dom = 3; + pthread_t thread; + int pipe_child[2], pipe_parent[2]; + char buffer; + struct thread_data child_data; + + child_data.parent_pid = getppid(); + ASSERT_EQ(0, pipe2(pipe_child, O_CLOEXEC)); + child_data.pipe_child = pipe_child[1]; + ASSERT_EQ(0, pipe2(pipe_parent, O_CLOEXEC)); + child_data.pipe_parent = pipe_parent[0]; + child_data.ruleset_fd = + landlock_create_ruleset(&ruleset_attr, sizeof(ruleset_attr), 0); + ASSERT_LE(0, child_data.ruleset_fd); + + /* TGID and TID are the same for the initial thread . */ + EXPECT_EQ(getpid(), gettid()); + EXPECT_EQ(0, prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)); + ASSERT_EQ(0, pthread_create(&thread, NULL, thread_audit_test, + &child_data)); + + /* Waits for the child to generate a denial. */ + ASSERT_EQ(1, read(pipe_child[0], &buffer, 1)); + EXPECT_EQ(0, close(pipe_child[0])); + + /* Matches the signal log to get the domain ID. */ + EXPECT_EQ(0, matches_log_signal(_metadata, self->audit_fd, + child_data.parent_pid, &denial_dom)); + EXPECT_NE(denial_dom, 1); + EXPECT_NE(denial_dom, 0); + + EXPECT_EQ(0, matches_log_domain_allocated(self->audit_fd, getpid(), + &allocated_dom)); + EXPECT_EQ(denial_dom, allocated_dom); + + /* Updates filter rules to match the drop record. */ + set_cap(_metadata, CAP_AUDIT_CONTROL); + EXPECT_EQ(0, audit_filter_drop(self->audit_fd, AUDIT_ADD_RULE)); + EXPECT_EQ(0, audit_filter_exe(self->audit_fd, &self->audit_filter, + AUDIT_DEL_RULE)); + clear_cap(_metadata, CAP_AUDIT_CONTROL); + + /* Signals the thread to exit, which will generate a domain deallocation. 
*/ + ASSERT_EQ(1, write(pipe_parent[1], ".", 1)); + EXPECT_EQ(0, close(pipe_parent[1])); + ASSERT_EQ(0, pthread_join(thread, NULL)); + + EXPECT_EQ(0, setsockopt(self->audit_fd, SOL_SOCKET, SO_RCVTIMEO, + &audit_tv_dom_drop, sizeof(audit_tv_dom_drop))); + EXPECT_EQ(0, matches_log_domain_deallocated(self->audit_fd, 1, + &deallocated_dom)); + EXPECT_EQ(denial_dom, deallocated_dom); + EXPECT_EQ(0, setsockopt(self->audit_fd, SOL_SOCKET, SO_RCVTIMEO, + &audit_tv_default, sizeof(audit_tv_default))); +} + FIXTURE(audit_flags) { struct audit_filter audit_filter; @@ -270,7 +392,8 @@ TEST_F(audit_flags, signal) /* Checks domain information records. */ EXPECT_EQ(0, matches_log_domain_allocated( - self->audit_fd, &allocated_dom)); + self->audit_fd, getpid(), + &allocated_dom)); EXPECT_NE(*self->domain_id, 1); EXPECT_NE(*self->domain_id, 0); EXPECT_EQ(*self->domain_id, allocated_dom); diff --git a/tools/testing/selftests/landlock/fs_test.c b/tools/testing/selftests/landlock/fs_test.c index f819011a8798..73729382d40f 100644 --- a/tools/testing/selftests/landlock/fs_test.c +++ b/tools/testing/selftests/landlock/fs_test.c @@ -5964,7 +5964,8 @@ TEST_F(audit_layout1, refer_handled) EXPECT_EQ(EXDEV, errno); EXPECT_EQ(0, matches_log_fs(_metadata, self->audit_fd, "fs\\.refer", dir_s1d1)); - EXPECT_EQ(0, matches_log_domain_allocated(self->audit_fd, NULL)); + EXPECT_EQ(0, + matches_log_domain_allocated(self->audit_fd, getpid(), NULL)); EXPECT_EQ(0, matches_log_fs(_metadata, self->audit_fd, "fs\\.refer", dir_s1d3)); -- cgit v1.2.3 From d27326a9999286fa45ad063f760e63329254f130 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 8 Apr 2025 14:01:26 +0300 Subject: dma-buf/sw_sync: Decrement refcount on error in sw_sync_ioctl_get_deadline() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Call dma_fence_put(fence) before returning an error if dma_fence_to_sync_pt() fails. Use an unwind ladder at the end of the function to do the cleanup. 
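(Illustrative aside, not part of this patch: the "unwind ladder" is the usual kernel error-handling shape in which each failure jumps to a label that releases everything acquired so far, in reverse order. A generic, stand-alone sketch with hypothetical helpers, not the sw_sync code itself:)

/* Sketch only: goto-based unwind ladder. */
#include <stdio.h>

static int acquire(const char *what) { printf("acquire %s\n", what); return 0; }
static void release(const char *what) { printf("release %s\n", what); }

static int do_something(void)
{
	int ret;

	ret = acquire("a");
	if (ret)
		return ret;

	ret = acquire("b");
	if (ret)
		goto err_release_a;

	ret = acquire("c");
	if (ret)
		goto err_release_b;

	/* ... use a, b and c, then release them on the success path too ... */
	release("c");
	release("b");
	release("a");
	return 0;

err_release_b:
	release("b");
err_release_a:
	release("a");
	return ret;
}

int main(void) { return do_something(); }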
Fixes: 70e67aaec2f4 ("dma-buf/sw_sync: Add fence deadline support") Signed-off-by: Dan Carpenter Reviewed-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/a010a1ac-107b-4fc0-a052-9fd3706ad690@stanley.mountain Signed-off-by: Christian König --- drivers/dma-buf/sw_sync.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/dma-buf/sw_sync.c b/drivers/dma-buf/sw_sync.c index f5905d67dedb..22a808995f10 100644 --- a/drivers/dma-buf/sw_sync.c +++ b/drivers/dma-buf/sw_sync.c @@ -438,15 +438,17 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a return -EINVAL; pt = dma_fence_to_sync_pt(fence); - if (!pt) - return -EINVAL; + if (!pt) { + ret = -EINVAL; + goto put_fence; + } spin_lock_irqsave(fence->lock, flags); - if (test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { - data.deadline_ns = ktime_to_ns(pt->deadline); - } else { + if (!test_bit(SW_SYNC_HAS_DEADLINE_BIT, &fence->flags)) { ret = -ENOENT; + goto unlock; } + data.deadline_ns = ktime_to_ns(pt->deadline); spin_unlock_irqrestore(fence->lock, flags); dma_fence_put(fence); @@ -458,6 +460,13 @@ static int sw_sync_ioctl_get_deadline(struct sync_timeline *obj, unsigned long a return -EFAULT; return 0; + +unlock: + spin_unlock_irqrestore(fence->lock, flags); +put_fence: + dma_fence_put(fence); + + return ret; } static long sw_sync_ioctl(struct file *file, unsigned int cmd, -- cgit v1.2.3 From af1352f82729d742506715176c15065a6b583167 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Apr 2025 18:18:23 +0300 Subject: Revert "xhci: Avoid queuing redundant Stop Endpoint command for stalled endpoint" This reverts commit 0c74d232578b1a7071e0312312811cb75b26b202. Paul Menzel reported that the two EP_STALLED patches in 6.15-rc1 cause regression. Turns out that the new flag may never get cleared after reset-resume, preventing xhci from restarting the endpoint. Revert this to take a proper look at it. 
Link: https://lore.kernel.org/linux-usb/84b400f8-2943-44e0-8803-f3aac3b670af@molgen.mpg.de cc: Paul Menzel cc: Michal Pecio Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250410151828.2868740-2-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 0452b8d65832..6370874bf265 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1770,8 +1770,8 @@ static int xhci_urb_dequeue(struct usb_hcd *hcd, struct urb *urb, int status) goto done; } - /* In these cases no commands are pending but the endpoint is stopped */ - if (ep->ep_state & (EP_CLEARING_TT | EP_STALLED)) { + /* In this case no commands are pending but the endpoint is stopped */ + if (ep->ep_state & EP_CLEARING_TT) { /* and cancelled TDs can be given back right away */ xhci_dbg(xhci, "Invalidating TDs instantly on slot %d ep %d in state 0x%x\n", urb->dev->slot_id, ep_index, ep->ep_state); @@ -3208,12 +3208,10 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, return; ep = &vdev->eps[ep_index]; - - spin_lock_irqsave(&xhci->lock, flags); - ep->ep_state &= ~EP_STALLED; /* Bail out if toggle is already being cleared by a endpoint reset */ + spin_lock_irqsave(&xhci->lock, flags); if (ep->ep_state & EP_HARD_CLEAR_TOGGLE) { ep->ep_state &= ~EP_HARD_CLEAR_TOGGLE; spin_unlock_irqrestore(&xhci->lock, flags); -- cgit v1.2.3 From b513cc1905bb360f48be281a1ded272131a8227a Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Apr 2025 18:18:24 +0300 Subject: Revert "xhci: Prevent early endpoint restart when handling STALL errors." This reverts commit 860f5d0d3594005d4588240028f42e8d2bfc725b. Paul Menzel reported that the two EP_STALLED patches in 6.15-rc1 cause regression. Turns out that the new flag may never get cleared after reset-resume, preventing xhci from restarting the endpoint. Revert this to take a proper look at it. Link: https://lore.kernel.org/linux-usb/84b400f8-2943-44e0-8803-f3aac3b670af@molgen.mpg.de cc: Paul Menzel cc: Michal Pecio Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250410151828.2868740-3-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 7 ++----- drivers/usb/host/xhci.c | 6 ------ drivers/usb/host/xhci.h | 3 +-- 3 files changed, 3 insertions(+), 13 deletions(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 5d64c297721c..94a2ae2c52e2 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -561,8 +561,8 @@ void xhci_ring_ep_doorbell(struct xhci_hcd *xhci, * pointer command pending because the device can choose to start any * stream once the endpoint is on the HW schedule. 
*/ - if (ep_state & (EP_STOP_CMD_PENDING | SET_DEQ_PENDING | EP_HALTED | - EP_CLEARING_TT | EP_STALLED)) + if ((ep_state & EP_STOP_CMD_PENDING) || (ep_state & SET_DEQ_PENDING) || + (ep_state & EP_HALTED) || (ep_state & EP_CLEARING_TT)) return; trace_xhci_ring_ep_doorbell(slot_id, DB_VALUE(ep_index, stream_id)); @@ -2573,9 +2573,6 @@ static void process_bulk_intr_td(struct xhci_hcd *xhci, struct xhci_virt_ep *ep, xhci_handle_halted_endpoint(xhci, ep, td, EP_SOFT_RESET); return; - case COMP_STALL_ERROR: - ep->ep_state |= EP_STALLED; - break; default: /* do nothing */ break; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index 6370874bf265..ca390beda85b 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -1605,11 +1605,6 @@ static int xhci_urb_enqueue(struct usb_hcd *hcd, struct urb *urb, gfp_t mem_flag goto free_priv; } - /* Class driver might not be aware ep halted due to async URB giveback */ - if (*ep_state & EP_STALLED) - dev_dbg(&urb->dev->dev, "URB %p queued before clearing halt\n", - urb); - switch (usb_endpoint_type(&urb->ep->desc)) { case USB_ENDPOINT_XFER_CONTROL: @@ -3208,7 +3203,6 @@ static void xhci_endpoint_reset(struct usb_hcd *hcd, return; ep = &vdev->eps[ep_index]; - ep->ep_state &= ~EP_STALLED; /* Bail out if toggle is already being cleared by a endpoint reset */ spin_lock_irqsave(&xhci->lock, flags); diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 37860f1e3aba..28b6264f8b87 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -664,7 +664,7 @@ struct xhci_virt_ep { unsigned int err_count; unsigned int ep_state; #define SET_DEQ_PENDING (1 << 0) -#define EP_HALTED (1 << 1) /* Halted host ep handling */ +#define EP_HALTED (1 << 1) /* For stall handling */ #define EP_STOP_CMD_PENDING (1 << 2) /* For URB cancellation */ /* Transitioning the endpoint to using streams, don't enqueue URBs */ #define EP_GETTING_STREAMS (1 << 3) @@ -675,7 +675,6 @@ struct xhci_virt_ep { #define EP_SOFT_CLEAR_TOGGLE (1 << 7) /* usb_hub_clear_tt_buffer is in progress */ #define EP_CLEARING_TT (1 << 8) -#define EP_STALLED (1 << 9) /* For stall handling */ /* ---- Related to URB cancellation ---- */ struct list_head cancelled_td_list; struct xhci_hcd *xhci; -- cgit v1.2.3 From 9e3a28793d2fde7a709e814d2504652eaba6ae98 Mon Sep 17 00:00:00 2001 From: Michal Pecio Date: Thu, 10 Apr 2025 18:18:25 +0300 Subject: usb: xhci: Fix Short Packet handling rework ignoring errors A Short Packet event before the last TRB of a TD is followed by another event on the final TRB on spec-compliant HCs, which is most of them. A 'last_td_was_short' flag was added to know if a TD has just completed as Short Packet and another event is to come. The flag was cleared after seeing the event (unless no TDs are pending, but that's a separate bug) or seeing a new TD complete as something other than Short Packet. A rework replaced the flag with an 'old_trb_comp_code' variable. When an event doesn't match the pending TD and the previous event was Short Packet, the new event is silently ignored. To preserve old behavior, 'old_trb_comp_code' should be cleared at this point, but instead it is being set to current comp code, which is often Short Packet again. This can cause more events to be silently ignored, even though they are no longer connected with the old TD that completed short and indicate a serious problem with the driver or the xHC. 
Common device classes like UAC in async mode, UVC, serial or the UAS status pipe complete as Short Packet routinely and could be affected. Clear 'old_trb_comp_code' to zero, which is an invalid completion code and the same value the variable starts with. This restores original behavior on Short Packet and also works for illegal Etron events, which the code has been extended to cover too. Fixes: b331a3d8097f ("xhci: Handle spurious events on Etron host isoc enpoints") Signed-off-by: Michal Pecio Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250410151828.2868740-4-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 94a2ae2c52e2..4e975caca235 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -2913,7 +2913,7 @@ static int handle_tx_event(struct xhci_hcd *xhci, if (xhci_spurious_success_tx_event(xhci, ep_ring)) { xhci_dbg(xhci, "Spurious event dma %pad, comp_code %u after %u\n", &ep_trb_dma, trb_comp_code, ep_ring->old_trb_comp_code); - ep_ring->old_trb_comp_code = trb_comp_code; + ep_ring->old_trb_comp_code = 0; return 0; } -- cgit v1.2.3 From 1ea050da5562af9b930d17cbbe9632d30f5df43a Mon Sep 17 00:00:00 2001 From: Michal Pecio Date: Thu, 10 Apr 2025 18:18:26 +0300 Subject: usb: xhci: Fix invalid pointer dereference in Etron workaround This check is performed before prepare_transfer() and prepare_ring(), so enqueue can already point at the final link TRB of a segment. And indeed it will, some 0.4% of times this code is called. Then enqueue + 1 is an invalid pointer. It will crash the kernel right away or load some junk which may look like a link TRB and cause the real link TRB to be replaced with a NOOP. This wouldn't end well. Use a functionally equivalent test which doesn't dereference the pointer and always gives correct result. Something has crashed my machine twice in recent days while playing with an Etron HC, and a control transfer stress test ran for confirmation has just crashed it again. The same test passes with this patch applied. Fixes: 5e1c67abc930 ("xhci: Fix control transfer error on Etron xHCI host") Cc: stable@vger.kernel.org Signed-off-by: Michal Pecio Signed-off-by: Mathias Nyman Reviewed-by: Kuangyi Chiang Link: https://lore.kernel.org/r/20250410151828.2868740-5-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci-ring.c b/drivers/usb/host/xhci-ring.c index 4e975caca235..b906bc2eea5f 100644 --- a/drivers/usb/host/xhci-ring.c +++ b/drivers/usb/host/xhci-ring.c @@ -3777,7 +3777,7 @@ int xhci_queue_ctrl_tx(struct xhci_hcd *xhci, gfp_t mem_flags, * enqueue a No Op TRB, this can prevent the Setup and Data Stage * TRB to be breaked by the Link TRB. 
*/ - if (trb_is_link(ep_ring->enqueue + 1)) { + if (last_trb_on_seg(ep_ring->enq_seg, ep_ring->enqueue + 1)) { field = TRB_TYPE(TRB_TR_NOOP) | ep_ring->cycle_state; queue_trb(xhci, ep_ring, false, 0, 0, TRB_INTR_TARGET(0), field); -- cgit v1.2.3 From bea5892d0ed274e03655223d1977cf59f9aff2f2 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Thu, 10 Apr 2025 18:18:27 +0300 Subject: xhci: Limit time spent with xHC interrupts disabled during bus resume Current xhci bus resume implementation prevents xHC host from generating interrupts during high-speed USB 2 and super-speed USB 3 bus resume. Only reason to disable interrupts during bus resume would be to prevent the interrupt handler from interfering with the resume process of USB 2 ports. Host initiated resume of USB 2 ports is done in two stages. The xhci driver first transitions the port from 'U3' to 'Resume' state, then wait in Resume for 20ms, and finally moves port to U0 state. xhci driver can't prevent interrupts by keeping the xhci spinlock due to this 20ms sleep. Limit interrupt disabling to the USB 2 port resume case only. resuming USB 2 ports in bus resume is only done in special cases where USB 2 ports had to be forced to suspend during bus suspend. The current way of preventing interrupts by clearing the 'Interrupt Enable' (INTE) bit in USBCMD register won't prevent the Interrupter registers 'Interrupt Pending' (IP), 'Event Handler Busy' (EHB) and USBSTS register Event Interrupt (EINT) bits from being set. New interrupts can't be issued before those bits are properly clered. Disable interrupts by clearing the interrupter register 'Interrupt Enable' (IE) bit instead. This way IP, EHB and INTE won't be set before IE is enabled again and a new interrupt is triggered. Reported-by: Devyn Liu Closes: https://lore.kernel.org/linux-usb/b1a9e2d51b4d4ff7a304f77c5be8164e@huawei.com/ Cc: stable@vger.kernel.org Tested-by: Devyn Liu Signed-off-by: Mathias Nyman Link: https://lore.kernel.org/r/20250410151828.2868740-6-mathias.nyman@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-hub.c | 30 ++++++++++++++++-------------- drivers/usb/host/xhci.c | 4 ++-- drivers/usb/host/xhci.h | 2 ++ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/drivers/usb/host/xhci-hub.c b/drivers/usb/host/xhci-hub.c index c0f226584a40..486347776cb2 100644 --- a/drivers/usb/host/xhci-hub.c +++ b/drivers/usb/host/xhci-hub.c @@ -1878,9 +1878,10 @@ int xhci_bus_resume(struct usb_hcd *hcd) int max_ports, port_index; int sret; u32 next_state; - u32 temp, portsc; + u32 portsc; struct xhci_hub *rhub; struct xhci_port **ports; + bool disabled_irq = false; rhub = xhci_get_rhub(hcd); ports = rhub->ports; @@ -1896,17 +1897,20 @@ int xhci_bus_resume(struct usb_hcd *hcd) return -ESHUTDOWN; } - /* delay the irqs */ - temp = readl(&xhci->op_regs->command); - temp &= ~CMD_EIE; - writel(temp, &xhci->op_regs->command); - /* bus specific resume for ports we suspended at bus_suspend */ - if (hcd->speed >= HCD_USB3) + if (hcd->speed >= HCD_USB3) { next_state = XDEV_U0; - else + } else { next_state = XDEV_RESUME; - + if (bus_state->bus_suspended) { + /* + * prevent port event interrupts from interfering + * with usb2 port resume process + */ + xhci_disable_interrupter(xhci->interrupters[0]); + disabled_irq = true; + } + } port_index = max_ports; while (port_index--) { portsc = readl(ports[port_index]->addr); @@ -1974,11 +1978,9 @@ int xhci_bus_resume(struct usb_hcd *hcd) (void) readl(&xhci->op_regs->command); bus_state->next_statechange = jiffies + 
msecs_to_jiffies(5); - /* re-enable irqs */ - temp = readl(&xhci->op_regs->command); - temp |= CMD_EIE; - writel(temp, &xhci->op_regs->command); - temp = readl(&xhci->op_regs->command); + /* re-enable interrupter */ + if (disabled_irq) + xhci_enable_interrupter(xhci->interrupters[0]); spin_unlock_irqrestore(&xhci->lock, flags); return 0; diff --git a/drivers/usb/host/xhci.c b/drivers/usb/host/xhci.c index ca390beda85b..90eb491267b5 100644 --- a/drivers/usb/host/xhci.c +++ b/drivers/usb/host/xhci.c @@ -322,7 +322,7 @@ static void xhci_zero_64b_regs(struct xhci_hcd *xhci) xhci_info(xhci, "Fault detected\n"); } -static int xhci_enable_interrupter(struct xhci_interrupter *ir) +int xhci_enable_interrupter(struct xhci_interrupter *ir) { u32 iman; @@ -335,7 +335,7 @@ static int xhci_enable_interrupter(struct xhci_interrupter *ir) return 0; } -static int xhci_disable_interrupter(struct xhci_interrupter *ir) +int xhci_disable_interrupter(struct xhci_interrupter *ir) { u32 iman; diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index 28b6264f8b87..242ab9fbc8ae 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1890,6 +1890,8 @@ int xhci_alloc_tt_info(struct xhci_hcd *xhci, struct usb_tt *tt, gfp_t mem_flags); int xhci_set_interrupter_moderation(struct xhci_interrupter *ir, u32 imod_interval); +int xhci_enable_interrupter(struct xhci_interrupter *ir); +int xhci_disable_interrupter(struct xhci_interrupter *ir); /* xHCI ring, segment, TRB, and TD functions */ dma_addr_t xhci_trb_virt_to_dma(struct xhci_segment *seg, union xhci_trb *trb); -- cgit v1.2.3 From 6907e8093b3070d877ee607e5ceede60cfd08bde Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 11 Apr 2025 12:22:39 +0100 Subject: nvmem: rockchip-otp: Move read-offset into variant-data The RK3588 has an offset into the OTP area where the readable area begins and automatically adds this to the start address. Other variants are very much similar to rk3588, just with a different offset, so move that value into variant-data. To match the size in bytes, store this value also in bytes and not in number of blocks. 
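As a rough standalone illustration (not part of the patch), dividing the new byte-based read offset by the 4-byte OTP block size gives back the block offset that the old RK3588_NO_SECURE_OFFSET constant encoded; the values below are taken from the rk3588 variant data in the diff that follows:

#include <stdio.h>

#define RK3588_NBYTES 4	/* bytes per OTP block */

int main(void)
{
	int read_offset = 0xc00;	/* variant read_offset, in bytes */
	int block_offset = read_offset / RK3588_NBYTES;

	/* prints 0x300, matching the old block-based offset constant */
	printf("block offset = 0x%x\n", block_offset);
	return 0;
}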
Signed-off-by: Heiko Stuebner Tested-by: Nicolas Frattaroli Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-2-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/rockchip-otp.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/nvmem/rockchip-otp.c b/drivers/nvmem/rockchip-otp.c index ebc3f0b24166..3edfbfc2d722 100644 --- a/drivers/nvmem/rockchip-otp.c +++ b/drivers/nvmem/rockchip-otp.c @@ -59,7 +59,6 @@ #define RK3588_OTPC_AUTO_EN 0x08 #define RK3588_OTPC_INT_ST 0x84 #define RK3588_OTPC_DOUT0 0x20 -#define RK3588_NO_SECURE_OFFSET 0x300 #define RK3588_NBYTES 4 #define RK3588_BURST_NUM 1 #define RK3588_BURST_SHIFT 8 @@ -69,6 +68,7 @@ struct rockchip_data { int size; + int read_offset; const char * const *clks; int num_clks; nvmem_reg_read_t reg_read; @@ -196,7 +196,7 @@ static int rk3588_otp_read(void *context, unsigned int offset, addr_start = round_down(offset, RK3588_NBYTES) / RK3588_NBYTES; addr_end = round_up(offset + bytes, RK3588_NBYTES) / RK3588_NBYTES; addr_len = addr_end - addr_start; - addr_start += RK3588_NO_SECURE_OFFSET; + addr_start += otp->data->read_offset / RK3588_NBYTES; buf = kzalloc(array_size(addr_len, RK3588_NBYTES), GFP_KERNEL); if (!buf) @@ -280,6 +280,7 @@ static const char * const rk3588_otp_clocks[] = { static const struct rockchip_data rk3588_data = { .size = 0x400, + .read_offset = 0xc00, .clks = rk3588_otp_clocks, .num_clks = ARRAY_SIZE(rk3588_otp_clocks), .reg_read = rk3588_otp_read, -- cgit v1.2.3 From 1b23c14c07326a095b93145ca9ea31cf53d4bde1 Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 11 Apr 2025 12:22:40 +0100 Subject: dt-bindings: nvmem: rockchip,otp: add missing limits for clock-names The clocks property correctly declares minItems and maxItems for its variants, but clock-names does not. Both properties are always used together, so should declare the same limits. Suggested-by: Krzysztof Kozlowski Signed-off-by: Heiko Stuebner Acked-by: Conor Dooley Tested-by: Nicolas Frattaroli Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-3-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml b/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml index a44d44b32809..3201ff8f9334 100644 --- a/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml +++ b/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml @@ -62,6 +62,8 @@ allOf: properties: clocks: maxItems: 3 + clock-names: + maxItems: 3 resets: maxItems: 1 reset-names: @@ -78,6 +80,8 @@ allOf: properties: clocks: minItems: 4 + clock-names: + minItems: 4 resets: minItems: 3 reset-names: -- cgit v1.2.3 From 9165960606dff725174155472583efec60f25bab Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 11 Apr 2025 12:22:41 +0100 Subject: dt-bindings: nvmem: rockchip,otp: Add compatible for RK3576 Document the OTP memory found on Rockchip RK3576 SoC. The RK3576 uses the same set of clocks as the px30/rk3308 but has one reset more, so adapt the binding to handle this variant as well. 
Signed-off-by: Heiko Stuebner Acked-by: Conor Dooley Tested-by: Nicolas Frattaroli Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-4-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/nvmem/rockchip,otp.yaml | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml b/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml index 3201ff8f9334..dc89020b0950 100644 --- a/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml +++ b/Documentation/devicetree/bindings/nvmem/rockchip,otp.yaml @@ -14,6 +14,7 @@ properties: enum: - rockchip,px30-otp - rockchip,rk3308-otp + - rockchip,rk3576-otp - rockchip,rk3588-otp reg: @@ -70,6 +71,26 @@ allOf: items: - const: phy + - if: + properties: + compatible: + contains: + enum: + - rockchip,rk3576-otp + then: + properties: + clocks: + maxItems: 3 + clock-names: + maxItems: 3 + resets: + minItems: 2 + maxItems: 2 + reset-names: + items: + - const: otp + - const: apb + - if: properties: compatible: -- cgit v1.2.3 From 50d75a13a9ce880a5ef07a4ccc63ba561cc2e69a Mon Sep 17 00:00:00 2001 From: Heiko Stuebner Date: Fri, 11 Apr 2025 12:22:42 +0100 Subject: nvmem: rockchip-otp: add rk3576 variant data The variant works very similar to the rk3588, just with a different read-offset and size. Signed-off-by: Heiko Stuebner Tested-by: Nicolas Frattaroli Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-5-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/rockchip-otp.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvmem/rockchip-otp.c b/drivers/nvmem/rockchip-otp.c index 3edfbfc2d722..d88f12c53242 100644 --- a/drivers/nvmem/rockchip-otp.c +++ b/drivers/nvmem/rockchip-otp.c @@ -274,6 +274,14 @@ static const struct rockchip_data px30_data = { .reg_read = px30_otp_read, }; +static const struct rockchip_data rk3576_data = { + .size = 0x100, + .read_offset = 0x700, + .clks = px30_otp_clocks, + .num_clks = ARRAY_SIZE(px30_otp_clocks), + .reg_read = rk3588_otp_read, +}; + static const char * const rk3588_otp_clocks[] = { "otp", "apb_pclk", "phy", "arb", }; @@ -295,6 +303,10 @@ static const struct of_device_id rockchip_otp_match[] = { .compatible = "rockchip,rk3308-otp", .data = &px30_data, }, + { + .compatible = "rockchip,rk3576-otp", + .data = &rk3576_data, + }, { .compatible = "rockchip,rk3588-otp", .data = &rk3588_data, -- cgit v1.2.3 From f487438d370590193c5635ccbae1b1c51c5b273c Mon Sep 17 00:00:00 2001 From: Akhil P Oommen Date: Fri, 11 Apr 2025 12:22:43 +0100 Subject: dt-bindings: nvmem: qfprom: Add X1E80100 compatible Document compatible string for the QFPROM on X1E80100 platform. 
Signed-off-by: Akhil P Oommen Reviewed-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-6-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index 39c209249c9c..4c332b44d35e 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -51,6 +51,7 @@ properties: - qcom,sm8450-qfprom - qcom,sm8550-qfprom - qcom,sm8650-qfprom + - qcom,x1e80100-qfprom - const: qcom,qfprom reg: -- cgit v1.2.3 From 269e074da1882c296085a27f0742c47ac860072e Mon Sep 17 00:00:00 2001 From: Barnabás Czémán Date: Fri, 11 Apr 2025 12:22:44 +0100 Subject: dt-bindings: nvmem: Add compatible for MSM8937 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Document the QFPROM block found on MSM8937. Signed-off-by: Barnabás Czémán Reviewed-by: Krzysztof Kozlowski Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-7-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index 4c332b44d35e..e1a8856ccba4 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -28,6 +28,7 @@ properties: - qcom,msm8226-qfprom - qcom,msm8916-qfprom - qcom,msm8917-qfprom + - qcom,msm8937-qfprom - qcom,msm8974-qfprom - qcom,msm8976-qfprom - qcom,msm8996-qfprom -- cgit v1.2.3 From eed6d954542fb55c814dd54b7fcc1b515bd76464 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 11 Apr 2025 12:22:45 +0100 Subject: dt-bindings: nvmem: fixed-cell: increase bits start value to 31 If NVMEM uses a data stride bigger than a byte, the starting bit offset of the cell might be bigger than a byte (e.g. if the data comes in the second byte of the 4-byte word). Allow the starting bit to be 8 or greater to reflect such use cases. Signed-off-by: Dmitry Baryshkov Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-8-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml index 8b3826243ddd..38e3ad50ff4f 100644 --- a/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml +++ b/Documentation/devicetree/bindings/nvmem/layouts/fixed-cell.yaml @@ -27,7 +27,7 @@ properties: $ref: /schemas/types.yaml#/definitions/uint32-array items: - minimum: 0 - maximum: 7 + maximum: 31 description: Offset in bit within the address range specified by reg. - minimum: 1 -- cgit v1.2.3 From 7a06ef75107799675ea6e4d73b9df37e18e352a8 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 11 Apr 2025 12:22:46 +0100 Subject: nvmem: core: fix bit offsets of more than one byte If the NVMEM specifies a stride to access data, reading a particular cell might require a bit offset that is bigger than one byte.
Rework NVMEM core code to support bit offsets of more than 8 bits. Signed-off-by: Dmitry Baryshkov Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-9-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index fff85bbf0ecd..7872903c08a1 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -837,7 +837,9 @@ static int nvmem_add_cells_from_dt(struct nvmem_device *nvmem, struct device_nod if (addr && len == (2 * sizeof(u32))) { info.bit_offset = be32_to_cpup(addr++); info.nbits = be32_to_cpup(addr); - if (info.bit_offset >= BITS_PER_BYTE || info.nbits < 1) { + if (info.bit_offset >= BITS_PER_BYTE * info.bytes || + info.nbits < 1 || + info.bit_offset + info.nbits > BITS_PER_BYTE * info.bytes) { dev_err(dev, "nvmem: invalid bits on %pOF\n", child); of_node_put(child); return -EINVAL; @@ -1630,21 +1632,29 @@ EXPORT_SYMBOL_GPL(nvmem_cell_put); static void nvmem_shift_read_buffer_in_place(struct nvmem_cell_entry *cell, void *buf) { u8 *p, *b; - int i, extra, bit_offset = cell->bit_offset; + int i, extra, bytes_offset; + int bit_offset = cell->bit_offset; p = b = buf; - if (bit_offset) { + + bytes_offset = bit_offset / BITS_PER_BYTE; + b += bytes_offset; + bit_offset %= BITS_PER_BYTE; + + if (bit_offset % BITS_PER_BYTE) { /* First shift */ - *b++ >>= bit_offset; + *p = *b++ >> bit_offset; /* setup rest of the bytes if any */ for (i = 1; i < cell->bytes; i++) { /* Get bits from next byte and shift them towards msb */ - *p |= *b << (BITS_PER_BYTE - bit_offset); + *p++ |= *b << (BITS_PER_BYTE - bit_offset); - p = b; - *b++ >>= bit_offset; + *p = *b++ >> bit_offset; } + } else if (p != b) { + memmove(p, b, cell->bytes - bytes_offset); + p += cell->bytes - 1; } else { /* point to the msb */ p += cell->bytes - 1; -- cgit v1.2.3 From 13bcd440f2ff38cd7e42a179c223d4b833158b33 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 11 Apr 2025 12:22:47 +0100 Subject: nvmem: core: verify cell's raw_len Check that the NVMEM cell's raw_len is aligned to word_size. Otherwise drivers might face an incomplete read while accessing the last part of the NVMEM cell. Signed-off-by: Dmitry Baryshkov Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-10-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 7872903c08a1..7b8c85f9e035 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -605,6 +605,18 @@ static int nvmem_cell_info_to_nvmem_cell_entry_nodup(struct nvmem_device *nvmem, return -EINVAL; } + if (!IS_ALIGNED(cell->raw_len, nvmem->word_size)) { + dev_err(&nvmem->dev, + "cell %s raw len %zd unaligned to nvmem word size %d\n", + cell->name ?: "", cell->raw_len, + nvmem->word_size); + + if (info->raw_len) + return -EINVAL; + + cell->raw_len = ALIGN(cell->raw_len, nvmem->word_size); + } + return 0; } -- cgit v1.2.3 From 6786484223d5705bf7f919c1e5055d478ebeec32 Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 11 Apr 2025 12:22:48 +0100 Subject: nvmem: core: update raw_len if the bit reading is required If an NVMEM cell uses a bit offset or specifies bit truncation, update raw_len manually (following the cell->bytes update), ensuring that the NVMEM access is still word-aligned.
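As a rough standalone illustration of the arithmetic described above (not part of the patch), the sketch below derives a cell's byte count and word-aligned raw_len from hypothetical values nbits = 4, bit_offset = 12 and word_size = 4, mirroring the DIV_ROUND_UP() and ALIGN() logic used in the diff that follows:

#include <stdio.h>

#define BITS_PER_BYTE	8
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

int main(void)
{
	unsigned int nbits = 4, bit_offset = 12, word_size = 4;
	unsigned int bytes = DIV_ROUND_UP(nbits + bit_offset, BITS_PER_BYTE);
	unsigned int raw_len = ALIGN(bytes, word_size);

	/* bytes = 2, raw_len = 4: the raw read stays word-aligned */
	printf("bytes = %u, raw_len = %u\n", bytes, raw_len);
	return 0;
}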
Signed-off-by: Dmitry Baryshkov Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-11-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvmem/core.c b/drivers/nvmem/core.c index 7b8c85f9e035..e206efc29a00 100644 --- a/drivers/nvmem/core.c +++ b/drivers/nvmem/core.c @@ -594,9 +594,11 @@ static int nvmem_cell_info_to_nvmem_cell_entry_nodup(struct nvmem_device *nvmem, cell->nbits = info->nbits; cell->np = info->np; - if (cell->nbits) + if (cell->nbits) { cell->bytes = DIV_ROUND_UP(cell->nbits + cell->bit_offset, BITS_PER_BYTE); + cell->raw_len = ALIGN(cell->bytes, nvmem->word_size); + } if (!IS_ALIGNED(cell->offset, nvmem->stride)) { dev_err(&nvmem->dev, -- cgit v1.2.3 From 3566a737db87a9bf360c2fd36433c5149f805f2e Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Fri, 11 Apr 2025 12:22:49 +0100 Subject: nvmem: qfprom: switch to 4-byte aligned reads All platforms since Snapdragon 8 Gen1 (SM8450) require using 4-byte reads to access QFPROM data. While older platforms were more than happy with 1-byte reads, change the qfprom driver to use 4-byte reads for all the platforms. Specify stride and word size of 4 bytes. To retain compatibility with the existing DT and to simplify porting data from vendor kernels, use fixup_dt_cell_info in order to bump alignment requirements. Signed-off-by: Dmitry Baryshkov Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-12-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/nvmem/qfprom.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/nvmem/qfprom.c b/drivers/nvmem/qfprom.c index 116a39e804c7..a872c640b8c5 100644 --- a/drivers/nvmem/qfprom.c +++ b/drivers/nvmem/qfprom.c @@ -321,19 +321,32 @@ static int qfprom_reg_read(void *context, unsigned int reg, void *_val, size_t bytes) { struct qfprom_priv *priv = context; - u8 *val = _val; - int i = 0, words = bytes; + u32 *val = _val; void __iomem *base = priv->qfpcorrected; + int words = DIV_ROUND_UP(bytes, sizeof(u32)); + int i; if (read_raw_data && priv->qfpraw) base = priv->qfpraw; - while (words--) - *val++ = readb(base + reg + i++); + for (i = 0; i < words; i++) + *val++ = readl(base + reg + i * sizeof(u32)); return 0; } +/* Align reads to word boundary */ +static void qfprom_fixup_dt_cell_info(struct nvmem_device *nvmem, + struct nvmem_cell_info *cell) +{ + unsigned int byte_offset = cell->offset % sizeof(u32); + + cell->bit_offset += byte_offset * BITS_PER_BYTE; + cell->offset -= byte_offset; + if (byte_offset && !cell->nbits) + cell->nbits = cell->bytes * BITS_PER_BYTE; +} + static void qfprom_runtime_disable(void *data) { pm_runtime_disable(data); @@ -358,10 +371,11 @@ static int qfprom_probe(struct platform_device *pdev) struct nvmem_config econfig = { .name = "qfprom", .add_legacy_fixed_of_cells = true, - .stride = 1, - .word_size = 1, + .stride = 4, + .word_size = 4, .id = NVMEM_DEVID_AUTO, .reg_read = qfprom_reg_read, + .fixup_dt_cell_info = qfprom_fixup_dt_cell_info, }; struct device *dev = &pdev->dev; struct resource *res; -- cgit v1.2.3 From b78de5c2c60f5f65ce401ca791692409ef9ceaec Mon Sep 17 00:00:00 2001 From: Sricharan Ramabadhran Date: Fri, 11 Apr 2025 12:22:50 +0100 Subject: dt-bindings: nvmem: Add compatible for IPQ5018 Document the QFPROM block found on IPQ5018 Reviewed-by: Krzysztof Kozlowski Signed-off-by: Sricharan 
Ramabadhran Signed-off-by: George Moussalem Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-13-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index e1a8856ccba4..2b39d27da57b 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -19,6 +19,7 @@ properties: - enum: - qcom,apq8064-qfprom - qcom,apq8084-qfprom + - qcom,ipq5018-qfprom - qcom,ipq5332-qfprom - qcom,ipq5424-qfprom - qcom,ipq6018-qfprom -- cgit v1.2.3 From e9a573e2d7c6409519c2aa367d524dba31694f95 Mon Sep 17 00:00:00 2001 From: Rudraksha Gupta Date: Fri, 11 Apr 2025 12:22:51 +0100 Subject: dt-bindings: nvmem: Add compatible for MSM8960 Document the QFPROM on MSM8960. Reviewed-by: Krzysztof Kozlowski Signed-off-by: Rudraksha Gupta Signed-off-by: Srinivas Kandagatla Link: https://lore.kernel.org/r/20250411112251.68002-14-srinivas.kandagatla@linaro.org Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml index 2b39d27da57b..3f6dc6a3a9f1 100644 --- a/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml +++ b/Documentation/devicetree/bindings/nvmem/qcom,qfprom.yaml @@ -30,6 +30,7 @@ properties: - qcom,msm8916-qfprom - qcom,msm8917-qfprom - qcom,msm8937-qfprom + - qcom,msm8960-qfprom - qcom,msm8974-qfprom - qcom,msm8976-qfprom - qcom,msm8996-qfprom -- cgit v1.2.3 From ec27386de23a511008c53aa2f3434ad180a3ca9a Mon Sep 17 00:00:00 2001 From: Andrei Kuchynski Date: Fri, 21 Mar 2025 14:37:26 +0000 Subject: usb: typec: class: Fix NULL pointer access Concurrent calls to typec_partner_unlink_device can lead to a NULL pointer dereference. This patch adds a mutex to protect USB device pointers and prevent this issue. The same mutex protects both the device pointers and the partner device registration. 
Cc: stable@vger.kernel.org Fixes: 59de2a56d127 ("usb: typec: Link enumerated USB devices with Type-C partner") Signed-off-by: Andrei Kuchynski Reviewed-by: Benson Leung Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250321143728.4092417-2-akuchynski@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 15 +++++++++++++-- drivers/usb/typec/class.h | 1 + 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 9c76c3d0c6cf..eadb150223f8 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -1052,6 +1052,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port, partner->usb_mode = USB_MODE_USB3; } + mutex_lock(&port->partner_link_lock); ret = device_register(&partner->dev); if (ret) { dev_err(&port->dev, "failed to register partner (%d)\n", ret); @@ -1063,6 +1064,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port, typec_partner_link_device(partner, port->usb2_dev); if (port->usb3_dev) typec_partner_link_device(partner, port->usb3_dev); + mutex_unlock(&port->partner_link_lock); return partner; } @@ -1083,12 +1085,14 @@ void typec_unregister_partner(struct typec_partner *partner) port = to_typec_port(partner->dev.parent); + mutex_lock(&port->partner_link_lock); if (port->usb2_dev) typec_partner_unlink_device(partner, port->usb2_dev); if (port->usb3_dev) typec_partner_unlink_device(partner, port->usb3_dev); device_unregister(&partner->dev); + mutex_unlock(&port->partner_link_lock); } EXPORT_SYMBOL_GPL(typec_unregister_partner); @@ -2041,10 +2045,11 @@ static struct typec_partner *typec_get_partner(struct typec_port *port) static void typec_partner_attach(struct typec_connector *con, struct device *dev) { struct typec_port *port = container_of(con, struct typec_port, con); - struct typec_partner *partner = typec_get_partner(port); + struct typec_partner *partner; struct usb_device *udev = to_usb_device(dev); enum usb_mode usb_mode; + mutex_lock(&port->partner_link_lock); if (udev->speed < USB_SPEED_SUPER) { usb_mode = USB_MODE_USB2; port->usb2_dev = dev; @@ -2053,18 +2058,22 @@ static void typec_partner_attach(struct typec_connector *con, struct device *dev port->usb3_dev = dev; } + partner = typec_get_partner(port); if (partner) { typec_partner_set_usb_mode(partner, usb_mode); typec_partner_link_device(partner, dev); put_device(&partner->dev); } + mutex_unlock(&port->partner_link_lock); } static void typec_partner_deattach(struct typec_connector *con, struct device *dev) { struct typec_port *port = container_of(con, struct typec_port, con); - struct typec_partner *partner = typec_get_partner(port); + struct typec_partner *partner; + mutex_lock(&port->partner_link_lock); + partner = typec_get_partner(port); if (partner) { typec_partner_unlink_device(partner, dev); put_device(&partner->dev); @@ -2074,6 +2083,7 @@ static void typec_partner_deattach(struct typec_connector *con, struct device *d port->usb2_dev = NULL; else if (port->usb3_dev == dev) port->usb3_dev = NULL; + mutex_unlock(&port->partner_link_lock); } /** @@ -2614,6 +2624,7 @@ struct typec_port *typec_register_port(struct device *parent, ida_init(&port->mode_ids); mutex_init(&port->port_type_lock); + mutex_init(&port->partner_link_lock); port->id = id; port->ops = cap->ops; diff --git a/drivers/usb/typec/class.h b/drivers/usb/typec/class.h index b3076a24ad2e..db2fe96c48ff 100644 --- a/drivers/usb/typec/class.h +++ b/drivers/usb/typec/class.h @@ -59,6 +59,7 @@ struct 
typec_port { enum typec_port_type port_type; enum usb_mode usb_mode; struct mutex port_type_lock; + struct mutex partner_link_lock; enum typec_orientation orientation; struct typec_switch *sw; -- cgit v1.2.3 From 66e1a887273c6b89f09bc11a40d0a71d5a081a8e Mon Sep 17 00:00:00 2001 From: Andrei Kuchynski Date: Fri, 21 Mar 2025 14:37:27 +0000 Subject: usb: typec: class: Invalidate USB device pointers on partner unregistration To avoid using invalid USB device pointers after a Type-C partner disconnects, this patch clears the pointers upon partner unregistration. This ensures a clean state for future connections. Cc: stable@vger.kernel.org Fixes: 59de2a56d127 ("usb: typec: Link enumerated USB devices with Type-C partner") Signed-off-by: Andrei Kuchynski Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20250321143728.4092417-3-akuchynski@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index eadb150223f8..3df3e3736916 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -1086,10 +1086,14 @@ void typec_unregister_partner(struct typec_partner *partner) port = to_typec_port(partner->dev.parent); mutex_lock(&port->partner_link_lock); - if (port->usb2_dev) + if (port->usb2_dev) { typec_partner_unlink_device(partner, port->usb2_dev); - if (port->usb3_dev) + port->usb2_dev = NULL; + } + if (port->usb3_dev) { typec_partner_unlink_device(partner, port->usb3_dev); + port->usb3_dev = NULL; + } device_unregister(&partner->dev); mutex_unlock(&port->partner_link_lock); -- cgit v1.2.3 From 4e28f79e3dffa52d327b46d1a78dac16efb5810b Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 16 Mar 2025 13:26:54 +0300 Subject: usb: chipidea: ci_hdrc_imx: fix usbmisc handling usbmisc is an optional device property so it is totally valid for the corresponding data->usbmisc_data to have a NULL value. Check that before dereferencing the pointer. Found by Linux Verification Center (linuxtesting.org) with Svace static analysis tool. 
Fixes: 74adad500346 ("usb: chipidea: ci_hdrc_imx: decrement device's refcount in .remove() and in the error path of .probe()") Cc: stable Signed-off-by: Fedor Pchelkin Acked-by: Peter Chen Link: https://lore.kernel.org/r/20250316102658.490340-2-pchelkin@ispras.ru Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_imx.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 1a7fc638213e..619779eef333 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -534,7 +534,8 @@ disable_hsic_regulator: cpu_latency_qos_remove_request(&data->pm_qos_req); data->ci_pdev = NULL; err_put: - put_device(data->usbmisc_data->dev); + if (data->usbmisc_data) + put_device(data->usbmisc_data->dev); return ret; } @@ -559,7 +560,8 @@ static void ci_hdrc_imx_remove(struct platform_device *pdev) if (data->hsic_pad_regulator) regulator_disable(data->hsic_pad_regulator); } - put_device(data->usbmisc_data->dev); + if (data->usbmisc_data) + put_device(data->usbmisc_data->dev); } static void ci_hdrc_imx_shutdown(struct platform_device *pdev) -- cgit v1.2.3 From 8cab0e9a3f3e8d700179e0d6141643d54a267fd5 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 16 Mar 2025 13:26:55 +0300 Subject: usb: chipidea: ci_hdrc_imx: fix call balance of regulator routines Upon encountering errors during the HSIC pinctrl handling section the regulator should be disabled. Use devm_add_action_or_reset() to let the regulator-disabling routine be handled by device resource management stack. Found by Linux Verification Center (linuxtesting.org). Fixes: 4d6141288c33 ("usb: chipidea: imx: pinctrl for HSIC is optional") Cc: stable Signed-off-by: Fedor Pchelkin Acked-by: Peter Chen Link: https://lore.kernel.org/r/20250316102658.490340-3-pchelkin@ispras.ru Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_imx.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index 619779eef333..d942b3c72640 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -336,6 +336,13 @@ static int ci_hdrc_imx_notify_event(struct ci_hdrc *ci, unsigned int event) return ret; } +static void ci_hdrc_imx_disable_regulator(void *arg) +{ + struct ci_hdrc_imx_data *data = arg; + + regulator_disable(data->hsic_pad_regulator); +} + static int ci_hdrc_imx_probe(struct platform_device *pdev) { struct ci_hdrc_imx_data *data; @@ -394,6 +401,13 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) "Failed to enable HSIC pad regulator\n"); goto err_put; } + ret = devm_add_action_or_reset(dev, + ci_hdrc_imx_disable_regulator, data); + if (ret) { + dev_err(dev, + "Failed to add regulator devm action\n"); + goto err_put; + } } } @@ -432,11 +446,11 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) ret = imx_get_clks(dev); if (ret) - goto disable_hsic_regulator; + goto qos_remove_request; ret = imx_prepare_enable_clks(dev); if (ret) - goto disable_hsic_regulator; + goto qos_remove_request; ret = clk_prepare_enable(data->clk_wakeup); if (ret) @@ -526,10 +540,7 @@ err_clk: clk_disable_unprepare(data->clk_wakeup); err_wakeup_clk: imx_disable_unprepare_clks(dev); -disable_hsic_regulator: - if (data->hsic_pad_regulator) - /* don't overwrite original ret (cf. 
EPROBE_DEFER) */ - regulator_disable(data->hsic_pad_regulator); +qos_remove_request: if (pdata.flags & CI_HDRC_PMQOS) cpu_latency_qos_remove_request(&data->pm_qos_req); data->ci_pdev = NULL; @@ -557,8 +568,6 @@ static void ci_hdrc_imx_remove(struct platform_device *pdev) clk_disable_unprepare(data->clk_wakeup); if (data->plat_data->flags & CI_HDRC_PMQOS) cpu_latency_qos_remove_request(&data->pm_qos_req); - if (data->hsic_pad_regulator) - regulator_disable(data->hsic_pad_regulator); } if (data->usbmisc_data) put_device(data->usbmisc_data->dev); -- cgit v1.2.3 From 8c531e0a8c2d82509ad97c6d3a1e6217c7ed136d Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Sun, 16 Mar 2025 13:26:56 +0300 Subject: usb: chipidea: ci_hdrc_imx: implement usb_phy_init() error handling usb_phy_init() may return an error code if e.g. its implementation fails to prepare/enable some clocks. And properly rollback on probe error path by calling the counterpart usb_phy_shutdown(). Found by Linux Verification Center (linuxtesting.org). Fixes: be9cae2479f4 ("usb: chipidea: imx: Fix ULPI on imx53") Cc: stable Signed-off-by: Fedor Pchelkin Acked-by: Peter Chen Link: https://lore.kernel.org/r/20250316102658.490340-4-pchelkin@ispras.ru Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/ci_hdrc_imx.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/usb/chipidea/ci_hdrc_imx.c b/drivers/usb/chipidea/ci_hdrc_imx.c index d942b3c72640..4f8bfd242b59 100644 --- a/drivers/usb/chipidea/ci_hdrc_imx.c +++ b/drivers/usb/chipidea/ci_hdrc_imx.c @@ -484,7 +484,11 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) of_usb_get_phy_mode(np) == USBPHY_INTERFACE_MODE_ULPI) { pdata.flags |= CI_HDRC_OVERRIDE_PHY_CONTROL; data->override_phy_control = true; - usb_phy_init(pdata.usb_phy); + ret = usb_phy_init(pdata.usb_phy); + if (ret) { + dev_err(dev, "Failed to init phy\n"); + goto err_clk; + } } if (pdata.flags & CI_HDRC_SUPPORTS_RUNTIME_PM) @@ -493,7 +497,7 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) ret = imx_usbmisc_init(data->usbmisc_data); if (ret) { dev_err(dev, "usbmisc init failed, ret=%d\n", ret); - goto err_clk; + goto phy_shutdown; } data->ci_pdev = ci_hdrc_add_device(dev, @@ -502,7 +506,7 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) if (IS_ERR(data->ci_pdev)) { ret = PTR_ERR(data->ci_pdev); dev_err_probe(dev, ret, "ci_hdrc_add_device failed\n"); - goto err_clk; + goto phy_shutdown; } if (data->usbmisc_data) { @@ -536,6 +540,9 @@ static int ci_hdrc_imx_probe(struct platform_device *pdev) disable_device: ci_hdrc_remove_device(data->ci_pdev); +phy_shutdown: + if (data->override_phy_control) + usb_phy_shutdown(data->phy); err_clk: clk_disable_unprepare(data->clk_wakeup); err_wakeup_clk: -- cgit v1.2.3 From a1059896f2bfdcebcdc7153c3be2307ea319501f Mon Sep 17 00:00:00 2001 From: Ralph Siemsen Date: Tue, 18 Mar 2025 11:09:32 -0400 Subject: usb: cdns3: Fix deadlock when using NCM gadget The cdns3 driver has the same NCM deadlock as fixed in cdnsp by commit 58f2fcb3a845 ("usb: cdnsp: Fix deadlock issue during using NCM gadget"). Under PREEMPT_RT the deadlock can be readily triggered by heavy network traffic, for example using "iperf --bidir" over NCM ethernet link. The deadlock occurs because the threaded interrupt handler gets preempted by a softirq, but both are protected by the same spinlock. Prevent deadlock by disabling softirq during threaded irq handler. 
Cc: stable Fixes: 7733f6c32e36 ("usb: cdns3: Add Cadence USB3 DRD Driver") Signed-off-by: Ralph Siemsen Acked-by: Peter Chen Reviewed-by: Sebastian Andrzej Siewior Link: https://lore.kernel.org/r/20250318-rfs-cdns3-deadlock-v2-1-bfd9cfcee732@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdns3-gadget.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/cdns3/cdns3-gadget.c b/drivers/usb/cdns3/cdns3-gadget.c index 694aa1457739..d9d8dc05b235 100644 --- a/drivers/usb/cdns3/cdns3-gadget.c +++ b/drivers/usb/cdns3/cdns3-gadget.c @@ -1963,6 +1963,7 @@ static irqreturn_t cdns3_device_thread_irq_handler(int irq, void *data) unsigned int bit; unsigned long reg; + local_bh_disable(); spin_lock_irqsave(&priv_dev->lock, flags); reg = readl(&priv_dev->regs->usb_ists); @@ -2004,6 +2005,7 @@ static irqreturn_t cdns3_device_thread_irq_handler(int irq, void *data) irqend: writel(~0, &priv_dev->regs->ep_ien); spin_unlock_irqrestore(&priv_dev->lock, flags); + local_bh_enable(); return ret; } -- cgit v1.2.3 From 38d6e60b6f3a99f8f13bee22eab616136c2c0675 Mon Sep 17 00:00:00 2001 From: Mike Looijmans Date: Tue, 18 Mar 2025 07:44:52 +0100 Subject: usb: dwc3: xilinx: Prevent spike in reset signal The "reset" GPIO controls the RESET signal to an external, usually ULPI PHY, chip. The original code path acquires the signal in LOW state, and then immediately asserts it HIGH again, if the reset signal defaulted to asserted, there'd be a short "spike" before the reset. Here is what happens depending on the pre-existing state of the reset signal: Reset (previously asserted): ~~~|_|~~~~|_______ Reset (previously deasserted): _____|~~~~|_______ ^ ^ ^ A B C At point A, the low going transition is because the reset line is requested using GPIOD_OUT_LOW. If the line is successfully requested, the first thing we do is set it high _without_ any delay. This is point B. So, a glitch occurs between A and B. Requesting the line using GPIOD_OUT_HIGH eliminates the A and B transitions. Instead we get: Reset (previously asserted) : ~~~~~~~~~~|______ Reset (previously deasserted): ____|~~~~~|______ ^ ^ A C Where A and C are the points described above in the code. Point B has been eliminated. The issue was found during code inspection. Also remove the cryptic "toggle ulpi .." comment. Fixes: ca05b38252d7 ("usb: dwc3: xilinx: Add gpio-reset support") Cc: stable Signed-off-by: Mike Looijmans Reviewed-by: Radhey Shyam Pandey Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250318064518.9320-1-mike.looijmans@topic.nl Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/dwc3-xilinx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/usb/dwc3/dwc3-xilinx.c b/drivers/usb/dwc3/dwc3-xilinx.c index a33a42ba0249..4ca7f6240d07 100644 --- a/drivers/usb/dwc3/dwc3-xilinx.c +++ b/drivers/usb/dwc3/dwc3-xilinx.c @@ -207,15 +207,13 @@ static int dwc3_xlnx_init_zynqmp(struct dwc3_xlnx *priv_data) skip_usb3_phy: /* ulpi reset via gpio-modepin or gpio-framework driver */ - reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW); + reset_gpio = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH); if (IS_ERR(reset_gpio)) { return dev_err_probe(dev, PTR_ERR(reset_gpio), "Failed to request reset GPIO\n"); } if (reset_gpio) { - /* Toggle ulpi to reset the phy. 
*/ - gpiod_set_value_cansleep(reset_gpio, 1); usleep_range(5000, 10000); gpiod_set_value_cansleep(reset_gpio, 0); usleep_range(5000, 10000); -- cgit v1.2.3 From bcb60d438547355b8f9ad48645909139b64d3482 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 28 Mar 2025 12:00:59 +0800 Subject: USB: OHCI: Add quirk for LS7A OHCI controller (rev 0x02) The OHCI controller (rev 0x02) under LS7A PCI host has a hardware flaw. MMIO register with offset 0x60/0x64 is treated as legacy PS2-compatible keyboard/mouse interface, which confuses the OHCI controller. Since OHCI actually only uses a 4KB BAR resource, the LS7A OHCI controller's 32KB BAR is wrapped around (the second 4KB BAR space is the same as the first 4KB internally). So we can add a 4KB offset (0x1000) to the OHCI registers (from the PCI BAR resource) as a quirk. Cc: stable Suggested-by: Bjorn Helgaas Reviewed-by: Alan Stern Tested-by: Mingcong Bai Signed-off-by: Huacai Chen Link: https://lore.kernel.org/r/20250328040059.3672979-1-chenhuacai@loongson.cn Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/ohci-pci.c | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/drivers/usb/host/ohci-pci.c b/drivers/usb/host/ohci-pci.c index 900ea0d368e0..9f0a6b27e47c 100644 --- a/drivers/usb/host/ohci-pci.c +++ b/drivers/usb/host/ohci-pci.c @@ -165,6 +165,25 @@ static int ohci_quirk_amd700(struct usb_hcd *hcd) return 0; } +static int ohci_quirk_loongson(struct usb_hcd *hcd) +{ + struct pci_dev *pdev = to_pci_dev(hcd->self.controller); + + /* + * Loongson's LS7A OHCI controller (rev 0x02) has a + * flaw. MMIO register with offset 0x60/64 is treated + * as legacy PS2-compatible keyboard/mouse interface. + * Since OHCI only use 4KB BAR resource, LS7A OHCI's + * 32KB BAR is wrapped around (the 2nd 4KB BAR space + * is the same as the 1st 4KB internally). So add 4KB + * offset (0x1000) to the OHCI registers as a quirk. + */ + if (pdev->revision == 0x2) + hcd->regs += SZ_4K; /* SZ_4K = 0x1000 */ + + return 0; +} + static int ohci_quirk_qemu(struct usb_hcd *hcd) { struct ohci_hcd *ohci = hcd_to_ohci(hcd); @@ -224,6 +243,10 @@ static const struct pci_device_id ohci_pci_quirks[] = { PCI_DEVICE(PCI_VENDOR_ID_ATI, 0x4399), .driver_data = (unsigned long)ohci_quirk_amd700, }, + { + PCI_DEVICE(PCI_VENDOR_ID_LOONGSON, 0x7a24), + .driver_data = (unsigned long)ohci_quirk_loongson, + }, { .vendor = PCI_VENDOR_ID_APPLE, .device = 0x003f, -- cgit v1.2.3 From 2932b6b547ec36ad2ed60fbf2117c0e46bb7d40a Mon Sep 17 00:00:00 2001 From: Miao Li Date: Tue, 1 Apr 2025 10:30:27 +0800 Subject: usb: quirks: add DELAY_INIT quirk for Silicon Motion Flash Drive When a Silicon Motion Flash Drive is connected to Huawei HiSilicon platforms and a system reboot test is performed for two thousand cycles, it will randomly work incorrectly on boot; setting the DELAY_INIT quirk can work around this issue.
Signed-off-by: Miao Li Cc: stable Link: https://lore.kernel.org/r/20250401023027.44894-1-limiao870622@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 8efbacc5bc34..61583a1b7ac6 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -383,6 +383,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0904, 0x6103), .driver_info = USB_QUIRK_LINEAR_FRAME_INTR_BINTERVAL }, + /* Silicon Motion Flash Drive */ + { USB_DEVICE(0x090c, 0x1000), .driver_info = USB_QUIRK_DELAY_INIT }, + /* Sound Devices USBPre2 */ { USB_DEVICE(0x0926, 0x0202), .driver_info = USB_QUIRK_ENDPOINT_IGNORE }, -- cgit v1.2.3 From 9ab75eee1a056f896b87d139044dd103adc532b9 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Thu, 3 Apr 2025 19:59:45 +0200 Subject: USB: storage: quirk for ADATA Portable HDD CH94 Version 1.60 specifically needs this quirk. Version 2.00 is known good. Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250403180004.343133-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_uas.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_uas.h b/drivers/usb/storage/unusual_uas.h index 1f8c9b16a0fb..d460d71b4257 100644 --- a/drivers/usb/storage/unusual_uas.h +++ b/drivers/usb/storage/unusual_uas.h @@ -83,6 +83,13 @@ UNUSUAL_DEV(0x0bc2, 0x331a, 0x0000, 0x9999, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_NO_REPORT_LUNS), +/* Reported-by: Oliver Neukum */ +UNUSUAL_DEV(0x125f, 0xa94a, 0x0160, 0x0160, + "ADATA", + "Portable HDD CH94", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_ATA_1X), + /* Reported-by: Benjamin Tissoires */ UNUSUAL_DEV(0x13fd, 0x3940, 0x0000, 0x9999, "Initio Corporation", -- cgit v1.2.3 From 63ccd26cd1f6600421795f6ca3e625076be06c9f Mon Sep 17 00:00:00 2001 From: Frode Isaksen Date: Thu, 3 Apr 2025 09:28:03 +0200 Subject: usb: dwc3: gadget: check that event count does not exceed event buffer length The event count is read from register DWC3_GEVNTCOUNT. There is a check for the count being zero, but not for exceeding the event buffer length. Check that event count does not exceed event buffer length, avoiding an out-of-bounds access when memcpy'ing the event. 
Crash log: Unable to handle kernel paging request at virtual address ffffffc0129be000 pc : __memcpy+0x114/0x180 lr : dwc3_check_event_buf+0xec/0x348 x3 : 0000000000000030 x2 : 000000000000dfc4 x1 : ffffffc0129be000 x0 : ffffff87aad60080 Call trace: __memcpy+0x114/0x180 dwc3_interrupt+0x24/0x34 Signed-off-by: Frode Isaksen Fixes: 72246da40f37 ("usb: Introduce DesignWare USB3 DRD Driver") Cc: stable Acked-by: Thinh Nguyen Link: https://lore.kernel.org/r/20250403072907.448524-1-fisaksen@baylibre.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/dwc3/gadget.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index 47e73c4ed62d..8c30d86cc4e3 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -4617,6 +4617,12 @@ static irqreturn_t dwc3_check_event_buf(struct dwc3_event_buffer *evt) if (!count) return IRQ_NONE; + if (count > evt->length) { + dev_err_ratelimited(dwc->dev, "invalid count(%u) > evt->length(%u)\n", + count, evt->length); + return IRQ_NONE; + } + evt->count = count; evt->flags |= DWC3_EVENT_PENDING; -- cgit v1.2.3 From e00b39a4f3552c730f1e24c8d62c4a8c6aad4e5d Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 8 Apr 2025 15:57:46 +0200 Subject: USB: VLI disk crashes if LPM is used This device needs the NO_LPM quirk. Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250408135800.792515-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 61583a1b7ac6..87b48c176160 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -542,6 +542,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x2040, 0x7200), .driver_info = USB_QUIRK_CONFIG_INTF_STRINGS }, + /* VLI disk */ + { USB_DEVICE(0x2109, 0x0711), .driver_info = USB_QUIRK_NO_LPM }, + /* Raydium Touchscreen */ { USB_DEVICE(0x2386, 0x3114), .driver_info = USB_QUIRK_NO_LPM }, -- cgit v1.2.3 From 9697f5efcf5fdea65b8390b5eb81bebe746ceedc Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 1 Apr 2025 10:45:38 +0200 Subject: USB: wdm: handle IO errors in wdm_wwan_port_start In case submitting the URB fails we must undo what we've done so far. Fixes: cac6fb015f71 ("usb: class: cdc-wdm: WWAN framework integration") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250401084749.175246-2-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 86ee39db013f..eb86d1c9b4a4 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -829,6 +829,7 @@ static struct usb_class_driver wdm_class = { static int wdm_wwan_port_start(struct wwan_port *port) { struct wdm_device *desc = wwan_port_get_drvdata(port); + int rv; /* The interface is both exposed via the WWAN framework and as a * legacy usbmisc chardev. 
If chardev is already open, just fail @@ -848,7 +849,15 @@ static int wdm_wwan_port_start(struct wwan_port *port) wwan_port_txon(port); /* Start getting events */ - return usb_submit_urb(desc->validity, GFP_KERNEL); + rv = usb_submit_urb(desc->validity, GFP_KERNEL); + if (rv < 0) { + wwan_port_txoff(port); + desc->manage_power(desc->intf, 0); + /* this must be last lest we race with chardev open */ + clear_bit(WDM_WWAN_IN_USE, &desc->flags); + } + + return rv; } static void wdm_wwan_port_stop(struct wwan_port *port) -- cgit v1.2.3 From c1846ed4eb527bdfe6b3b7dd2c78e2af4bf98f4f Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 1 Apr 2025 10:45:39 +0200 Subject: USB: wdm: close race between wdm_open and wdm_wwan_port_stop Clearing WDM_WWAN_IN_USE must be the last action or we can open a chardev whose URBs are still poisoned. Fixes: cac6fb015f71 ("usb: class: cdc-wdm: WWAN framework integration") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250401084749.175246-3-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index eb86d1c9b4a4..9c686751ddc1 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -726,7 +726,7 @@ static int wdm_open(struct inode *inode, struct file *file) rv = -EBUSY; goto out; } - + smp_rmb(); /* ordered against wdm_wwan_port_stop() */ rv = usb_autopm_get_interface(desc->intf); if (rv < 0) { dev_err(&desc->intf->dev, "Error autopm - %d\n", rv); @@ -868,8 +868,10 @@ static void wdm_wwan_port_stop(struct wwan_port *port) poison_urbs(desc); desc->manage_power(desc->intf, 0); clear_bit(WDM_READ, &desc->flags); - clear_bit(WDM_WWAN_IN_USE, &desc->flags); unpoison_urbs(desc); + smp_wmb(); /* ordered against wdm_open() */ + /* this must be last lest we open a poisoned device */ + clear_bit(WDM_WWAN_IN_USE, &desc->flags); } static void wdm_wwan_port_tx_complete(struct urb *urb) -- cgit v1.2.3 From 1fdc4dca350c0b8ada0b8ebf212504e1ad55e511 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 1 Apr 2025 10:45:40 +0200 Subject: USB: wdm: wdm_wwan_port_tx_complete mutex in atomic context wdm_wwan_port_tx_complete is called from a completion handler with irqs disabled and possibly in IRQ context. usb_autopm_put_interface can take a mutex. Hence usb_autopm_put_interface_async must be used.
Fixes: cac6fb015f71 ("usb: class: cdc-wdm: WWAN framework integration") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250401084749.175246-4-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index 9c686751ddc1..fc34a5412690 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -879,7 +879,7 @@ static void wdm_wwan_port_tx_complete(struct urb *urb) struct sk_buff *skb = urb->context; struct wdm_device *desc = skb_shinfo(skb)->destructor_arg; - usb_autopm_put_interface(desc->intf); + usb_autopm_put_interface_async(desc->intf); wwan_port_txon(desc->wwanp); kfree_skb(skb); } -- cgit v1.2.3 From 73e9cc1ffd3650b12c4eb059dfdafd56e725ceda Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Tue, 1 Apr 2025 10:45:41 +0200 Subject: USB: wdm: add annotation This is not understandable without a comment on endianness Fixes: afba937e540c9 ("USB: CDC WDM driver") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250401084749.175246-5-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/cdc-wdm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/class/cdc-wdm.c b/drivers/usb/class/cdc-wdm.c index fc34a5412690..16e7fa4d488d 100644 --- a/drivers/usb/class/cdc-wdm.c +++ b/drivers/usb/class/cdc-wdm.c @@ -909,7 +909,7 @@ static int wdm_wwan_port_tx(struct wwan_port *port, struct sk_buff *skb) req->bRequestType = (USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_INTERFACE); req->bRequest = USB_CDC_SEND_ENCAPSULATED_COMMAND; req->wValue = 0; - req->wIndex = desc->inum; + req->wIndex = desc->inum; /* already converted */ req->wLength = cpu_to_le16(skb->len); skb_shinfo(skb)->destructor_arg = desc; -- cgit v1.2.3 From 7094832b5ac861b0bd7ed8866c93cb15ef619996 Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Tue, 8 Apr 2025 19:22:47 +0200 Subject: serial: msm: Configure correct working mode before starting earlycon The MSM UART DM controller supports different working modes, e.g. DMA or the "single-character mode", where all reads/writes operate on a single character rather than 4 chars (32-bit) at once. When using earlycon, __msm_console_write() always writes 4 characters at a time, but we don't know which mode the bootloader was using and we don't set the mode either. This causes garbled output if the bootloader was using the single-character mode, because only every 4th character appears in the serial console, e.g. "[ 00oni pi 000xf0[ 00i s 5rm9(l)l s 1 1 SPMTA 7:C 5[ 00A ade k d[ 00ano:ameoi .Q1B[ 00ac _idaM00080oo'" If the bootloader was using the DMA ("DM") mode, output would likely fail entirely. Later, when the full serial driver probes, the port is re-initialized and output works as expected. Fix this also for earlycon by clearing the DMEN register and reset+re-enable the transmitter to apply the change. This ensures the transmitter is in the expected state before writing any output. 
Cc: stable Fixes: 0efe72963409 ("tty: serial: msm: Add earlycon support") Signed-off-by: Stephan Gerhold Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250408-msm-serial-earlycon-v1-1-429080127530@linaro.org Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/msm_serial.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/serial/msm_serial.c b/drivers/tty/serial/msm_serial.c index 1b137e068444..3449945493ce 100644 --- a/drivers/tty/serial/msm_serial.c +++ b/drivers/tty/serial/msm_serial.c @@ -1746,6 +1746,12 @@ msm_serial_early_console_setup_dm(struct earlycon_device *device, if (!device->port.membase) return -ENODEV; + /* Disable DM / single-character modes */ + msm_write(&device->port, 0, UARTDM_DMEN); + msm_write(&device->port, MSM_UART_CR_CMD_RESET_RX, MSM_UART_CR); + msm_write(&device->port, MSM_UART_CR_CMD_RESET_TX, MSM_UART_CR); + msm_write(&device->port, MSM_UART_CR_TX_ENABLE, MSM_UART_CR); + device->con->write = msm_serial_early_write_dm; return 0; } -- cgit v1.2.3 From ee6a44da3c87cf64d67dd02be8c0127a5bf56175 Mon Sep 17 00:00:00 2001 From: Günther Noack Date: Fri, 11 Apr 2025 09:01:45 +0200 Subject: tty: Require CAP_SYS_ADMIN for all usages of TIOCL_SELMOUSEREPORT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This requirement was overeagerly loosened in commit 2f83e38a095f ("tty: Permit some TIOCL_SETSEL modes without CAP_SYS_ADMIN"), but as it turns out, (1) the logic I implemented there was inconsistent (apologies!), (2) TIOCL_SELMOUSEREPORT might actually be a small security risk after all, and (3) TIOCL_SELMOUSEREPORT is only meant to be used by the mouse daemon (GPM or Consolation), which runs as CAP_SYS_ADMIN already. In more detail: 1. The previous patch has inconsistent logic: In commit 2f83e38a095f ("tty: Permit some TIOCL_SETSEL modes without CAP_SYS_ADMIN"), we checked for sel_mode == TIOCL_SELMOUSEREPORT, but overlooked that the lower four bits of this "mode" parameter were actually used as an additional way to pass an argument. So the patch did actually still require CAP_SYS_ADMIN, if any of the mouse button bits are set, but did not require it if none of the mouse buttons bits are set. This logic is inconsistent and was not intentional. We should have the same policies for using TIOCL_SELMOUSEREPORT independent of the value of the "hidden" mouse button argument. I sent a separate documentation patch to the man page list with more details on TIOCL_SELMOUSEREPORT: https://lore.kernel.org/all/20250223091342.35523-2-gnoack3000@gmail.com/ 2. TIOCL_SELMOUSEREPORT is indeed a potential security risk which can let an attacker simulate "keyboard" input to command line applications on the same terminal, like TIOCSTI and some other TIOCLINUX "selection mode" IOCTLs. By enabling mouse reporting on a terminal and then injecting mouse reports through TIOCL_SELMOUSEREPORT, an attacker can simulate mouse movements on the same terminal, similar to the TIOCSTI keystroke injection attacks that were previously possible with TIOCSTI and other TIOCL_SETSEL selection modes. Many programs (including libreadline/bash) are then prone to misinterpret these mouse reports as normal keyboard input because they do not expect input in the X11 mouse protocol form. The attacker does not have complete control over the escape sequence, but they can at least control the values of two consecutive bytes in the binary mouse reporting escape sequence. 
I went into more detail on that in the discussion at https://lore.kernel.org/all/20250221.0a947528d8f3@gnoack.org/ It is not equally trivial to simulate arbitrary keystrokes as it was with TIOCSTI (commit 83efeeeb3d04 ("tty: Allow TIOCSTI to be disabled")), but the general mechanism is there, and together with the small number of existing legit use cases (see below), it would be better to revert back to requiring CAP_SYS_ADMIN for TIOCL_SELMOUSEREPORT, as it was already the case before commit 2f83e38a095f ("tty: Permit some TIOCL_SETSEL modes without CAP_SYS_ADMIN"). 3. TIOCL_SELMOUSEREPORT is only used by the mouse daemons (GPM or Consolation), and they are the only legit use case: To quote console_codes(4): The mouse tracking facility is intended to return xterm(1)-compatible mouse status reports. Because the console driver has no way to know the device or type of the mouse, these reports are returned in the console input stream only when the virtual terminal driver receives a mouse update ioctl. These ioctls must be generated by a mouse-aware user-mode application such as the gpm(8) daemon. Jared Finder has also confirmed in https://lore.kernel.org/all/491f3df9de6593df8e70dbe77614b026@finder.org/ that Emacs does not call TIOCL_SELMOUSEREPORT directly, and it would be difficult to find good reasons for doing that, given that it would interfere with the reports that GPM is sending. More information on the interaction between GPM, terminals and the kernel with additional pointers is also available in this patch: https://lore.kernel.org/all/a773e48920aa104a65073671effbdee665c105fc.1603963593.git.tammo.block@gmail.com/ For background on who else uses TIOCL_SELMOUSEREPORT: Debian Code search finds one page of results, the only two known callers are the two mouse daemons GPM and Consolation. (GPM does not show up in the search results because it uses literal numbers to refer to TIOCLINUX-related enums. I looked through GPM by hand instead. TIOCL_SELMOUSEREPORT is also not used from libgpm.) https://codesearch.debian.net/search?q=TIOCL_SELMOUSEREPORT Cc: Jared Finder Cc: Jann Horn Cc: Hanno Böck Cc: Jiri Slaby Cc: Kees Cook Cc: stable Fixes: 2f83e38a095f ("tty: Permit some TIOCL_SETSEL modes without CAP_SYS_ADMIN") Signed-off-by: Günther Noack Link: https://lore.kernel.org/r/20250411070144.3959-2-gnoack3000@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/vt/selection.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/tty/vt/selection.c b/drivers/tty/vt/selection.c index 0bd6544e30a6..791e2f1f7c0b 100644 --- a/drivers/tty/vt/selection.c +++ b/drivers/tty/vt/selection.c @@ -193,13 +193,12 @@ int set_selection_user(const struct tiocl_selection __user *sel, return -EFAULT; /* - * TIOCL_SELCLEAR, TIOCL_SELPOINTER and TIOCL_SELMOUSEREPORT are OK to - * use without CAP_SYS_ADMIN as they do not modify the selection. + * TIOCL_SELCLEAR and TIOCL_SELPOINTER are OK to use without + * CAP_SYS_ADMIN as they do not modify the selection. 
*/ switch (v.sel_mode) { case TIOCL_SELCLEAR: case TIOCL_SELPOINTER: - case TIOCL_SELMOUSEREPORT: break; default: if (!capable(CAP_SYS_ADMIN)) -- cgit v1.2.3 From 4c324085062919d4e21c69e5e78456dcec0052fe Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Wed, 9 Apr 2025 19:13:20 -0500 Subject: scsi: ufs: mcq: Add NULL check in ufshcd_mcq_abort() A race can occur between the MCQ completion path and the abort handler: once a request completes, __blk_mq_free_request() sets rq->mq_hctx to NULL, meaning the subsequent ufshcd_mcq_req_to_hwq() call in ufshcd_mcq_abort() can return a NULL pointer. If this NULL pointer is dereferenced, the kernel will crash. Add a NULL check for the returned hwq pointer. If hwq is NULL, log an error and return FAILED, preventing a potential NULL-pointer dereference. As suggested by Bart, the ufshcd_cmd_inflight() check is removed. This is similar to the fix in commit 74736103fb41 ("scsi: ufs: core: Fix ufshcd_abort_one racing issue"). This is found by our static analysis tool KNighter. Signed-off-by: Chenyuan Yang Link: https://lore.kernel.org/r/20250410001320.2219341-1-chenyuan0y@gmail.com Fixes: f1304d442077 ("scsi: ufs: mcq: Added ufshcd_mcq_abort()") Reviewed-by: Bart Van Assche Reviewed-by: Peter Wang Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufs-mcq.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/ufs/core/ufs-mcq.c b/drivers/ufs/core/ufs-mcq.c index 240ce135bbfb..f1294c29f484 100644 --- a/drivers/ufs/core/ufs-mcq.c +++ b/drivers/ufs/core/ufs-mcq.c @@ -677,13 +677,6 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) unsigned long flags; int err; - if (!ufshcd_cmd_inflight(lrbp->cmd)) { - dev_err(hba->dev, - "%s: skip abort. cmd at tag %d already completed.\n", - __func__, tag); - return FAILED; - } - /* Skip task abort in case previous aborts failed and report failure */ if (lrbp->req_abort_skip) { dev_err(hba->dev, "%s: skip abort. tag %d failed earlier\n", @@ -692,6 +685,11 @@ int ufshcd_mcq_abort(struct scsi_cmnd *cmd) } hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd)); + if (!hwq) { + dev_err(hba->dev, "%s: skip abort. cmd at tag %d already completed.\n", + __func__, tag); + return FAILED; + } if (ufshcd_mcq_sqe_search(hba, hwq, tag)) { /* -- cgit v1.2.3 From cdd445258db9919e9dde497a6d5c3477ea7faf4d Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Fri, 11 Apr 2025 16:44:18 +0530 Subject: scsi: mpi3mr: Fix pending I/O counter Commit 199510e33dea ("scsi: mpi3mr: Update consumer index of reply queues after every 100 replies") introduced a regression with the per-reply queue pending I/O counter which was erroneously decremented, leading to the counter going negative. Drop the incorrect atomic decrement for the pending I/O counter. Fixes: 199510e33dea ("scsi: mpi3mr: Update consumer index of reply queues after every 100 replies") Cc: stable@vger.kernel.org Co-developed-by: Sathya Prakash Signed-off-by: Sathya Prakash Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20250411111419.135485-2-ranjan.kumar@broadcom.com Signed-off-by: Martin K. 
Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 3fcb1ad3b070..d6e402aacb2a 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -565,7 +565,7 @@ int mpi3mr_process_op_reply_q(struct mpi3mr_ioc *mrioc, WRITE_ONCE(op_req_q->ci, le16_to_cpu(reply_desc->request_queue_ci)); mpi3mr_process_op_reply_desc(mrioc, reply_desc, &reply_dma, reply_qidx); - atomic_dec(&op_reply_q->pend_ios); + if (reply_dma) mpi3mr_repost_reply_buf(mrioc, reply_dma); num_op_reply++; -- cgit v1.2.3 From 3b5091fee49ffcee512901318ca2425bb1e31a5c Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Fri, 11 Apr 2025 16:44:19 +0530 Subject: scsi: mpi3mr: Reset the pending interrupt flag If an admin interrupt is missed, admin_pend_isr may stay set and trigger admin reply processing even when no admin I/Os are pending. Clearing/resetting it in the admin completion path prevents this. Fixes: ca41929b2ed5 ("scsi: mpi3mr: Check admin reply queue from Watchdog") Cc: stable@vger.kernel.org Co-developed-by: Sathya Prakash Signed-off-by: Sathya Prakash Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20250411111419.135485-3-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index d6e402aacb2a..003e1f7005c4 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -451,6 +451,7 @@ int mpi3mr_process_admin_reply_q(struct mpi3mr_ioc *mrioc) return 0; } + atomic_set(&mrioc->admin_pend_isr, 0); reply_desc = (struct mpi3_default_reply_descriptor *)mrioc->admin_reply_base + admin_reply_ci; @@ -2925,6 +2926,7 @@ static int mpi3mr_setup_admin_qpair(struct mpi3mr_ioc *mrioc) mrioc->admin_reply_ci = 0; mrioc->admin_reply_ephase = 1; atomic_set(&mrioc->admin_reply_q_in_use, 0); + atomic_set(&mrioc->admin_pend_isr, 0); if (!mrioc->admin_req_base) { mrioc->admin_req_base = dma_alloc_coherent(&mrioc->pdev->dev, @@ -4653,6 +4655,7 @@ void mpi3mr_memset_buffers(struct mpi3mr_ioc *mrioc) if (mrioc->admin_reply_base) memset(mrioc->admin_reply_base, 0, mrioc->admin_reply_q_sz); atomic_set(&mrioc->admin_reply_q_in_use, 0); + atomic_set(&mrioc->admin_pend_isr, 0); if (mrioc->init_cmds.reply) { memset(mrioc->init_cmds.reply, 0, sizeof(*mrioc->init_cmds.reply)); -- cgit v1.2.3 From 7f533cc5ee4c4436cee51dc58e81dfd9c3384418 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 24 Dec 2024 13:17:57 +0300 Subject: scsi: target: iscsi: Fix timeout on deleted connection The NOPIN response timer may expire on a deleted connection and crash with logs such as: Did not receive response to NOPIN on CID: 0, failing connection for I_T Nexus (null),i,0x00023d000125,iqn.2017-01.com.iscsi.target,t,0x3d BUG: Kernel NULL pointer dereference on read at 0x00000000 NIP strlcpy+0x8/0xb0 LR iscsit_fill_cxn_timeout_err_stats+0x5c/0xc0 [iscsi_target_mod] Call Trace: iscsit_handle_nopin_response_timeout+0xfc/0x120 [iscsi_target_mod] call_timer_fn+0x58/0x1f0 run_timer_softirq+0x740/0x860 __do_softirq+0x16c/0x420 irq_exit+0x188/0x1c0 timer_interrupt+0x184/0x410 That is because the nopin response timer may be re-started when the nopin timer expires. Stop the nopin timer before stopping the nopin response timer to be sure that neither of them will be re-started.
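For illustration, a minimal sketch of the intended teardown order (the role of each timer is paraphrased in the comments, not quoted from the handler code):

	/*
	 * Stop the nopin timer first: its expiry handler is what queues a new
	 * NOPIN and (re-)arms the nopin response timer.  Only then stop the
	 * response timer, which at this point can no longer be re-armed.
	 */
	iscsit_stop_nopin_timer(conn);
	iscsit_stop_nopin_response_timer(conn);

Stopping them in the opposite order leaves a window in which the nopin timer fires after the response timer has been stopped and re-arms it on a connection that is being torn down.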
Signed-off-by: Dmitry Bogdanov Link: https://lore.kernel.org/r/20241224101757.32300-1-d.bogdanov@yadro.com Reviewed-by: Maurizio Lombardi Signed-off-by: Martin K. Petersen --- drivers/target/iscsi/iscsi_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 1244ef3aa86c..620ba6e0ab07 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4263,8 +4263,8 @@ int iscsit_close_connection( spin_unlock(&iscsit_global->ts_bitmap_lock); iscsit_stop_timers_for_cmds(conn); - iscsit_stop_nopin_response_timer(conn); iscsit_stop_nopin_timer(conn); + iscsit_stop_nopin_response_timer(conn); if (conn->conn_transport->iscsit_wait_conn) conn->conn_transport->iscsit_wait_conn(conn); -- cgit v1.2.3 From f8cba9a700cf38b181df7c1d809cd73c6e1b2df9 Mon Sep 17 00:00:00 2001 From: Manish Pandey Date: Fri, 11 Apr 2025 17:46:29 +0530 Subject: scsi: ufs: qcom: Add quirks for Samsung UFS devices Introduce quirks for Samsung UFS devices to adjust PA TX HSG1 sync length and TX_HS_EQUALIZER settings on the Qualcomm UFS Host controller. This ensures proper functionality of Samsung UFS devices with the Qualcomm UFS Host controller. Signed-off-by: Manish Pandey Link: https://lore.kernel.org/r/20250411121630.21330-2-quic_mapa@quicinc.com Reviewed-by: Bean Huo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/host/ufs-qcom.c | 43 +++++++++++++++++++++++++++++++++++++++++++ drivers/ufs/host/ufs-qcom.h | 18 ++++++++++++++++++ 2 files changed, 61 insertions(+) diff --git a/drivers/ufs/host/ufs-qcom.c b/drivers/ufs/host/ufs-qcom.c index 1b37449fbffc..c0761ccc1381 100644 --- a/drivers/ufs/host/ufs-qcom.c +++ b/drivers/ufs/host/ufs-qcom.c @@ -33,6 +33,10 @@ ((((c) >> 16) & MCQ_QCFGPTR_MASK) * MCQ_QCFGPTR_UNIT) #define MCQ_QCFG_SIZE 0x40 +/* De-emphasis for gear-5 */ +#define DEEMPHASIS_3_5_dB 0x04 +#define NO_DEEMPHASIS 0x0 + enum { TSTBUS_UAWM, TSTBUS_UARM, @@ -795,6 +799,23 @@ static int ufs_qcom_icc_update_bw(struct ufs_qcom_host *host) return ufs_qcom_icc_set_bw(host, bw_table.mem_bw, bw_table.cfg_bw); } +static void ufs_qcom_set_tx_hs_equalizer(struct ufs_hba *hba, u32 gear, u32 tx_lanes) +{ + u32 equalizer_val; + int ret, i; + + /* Determine the equalizer value based on the gear */ + equalizer_val = (gear == 5) ? 
DEEMPHASIS_3_5_dB : NO_DEEMPHASIS; + + for (i = 0; i < tx_lanes; i++) { + ret = ufshcd_dme_set(hba, UIC_ARG_MIB_SEL(TX_HS_EQUALIZER, i), + equalizer_val); + if (ret) + dev_err(hba->dev, "%s: failed equalizer lane %d\n", + __func__, i); + } +} + static int ufs_qcom_pwr_change_notify(struct ufs_hba *hba, enum ufs_notify_change_status status, const struct ufs_pa_layer_attr *dev_max_params, @@ -846,6 +867,11 @@ static int ufs_qcom_pwr_change_notify(struct ufs_hba *hba, dev_req_params->gear_tx, PA_INITIAL_ADAPT); } + + if (hba->dev_quirks & UFS_DEVICE_QUIRK_PA_TX_DEEMPHASIS_TUNING) + ufs_qcom_set_tx_hs_equalizer(hba, + dev_req_params->gear_tx, dev_req_params->lane_tx); + break; case POST_CHANGE: if (ufs_qcom_cfg_timers(hba, false)) { @@ -893,6 +919,16 @@ static int ufs_qcom_quirk_host_pa_saveconfigtime(struct ufs_hba *hba) (pa_vs_config_reg1 | (1 << 12))); } +static void ufs_qcom_override_pa_tx_hsg1_sync_len(struct ufs_hba *hba) +{ + int err; + + err = ufshcd_dme_peer_set(hba, UIC_ARG_MIB(PA_TX_HSG1_SYNC_LENGTH), + PA_TX_HSG1_SYNC_LENGTH_VAL); + if (err) + dev_err(hba->dev, "Failed (%d) set PA_TX_HSG1_SYNC_LENGTH\n", err); +} + static int ufs_qcom_apply_dev_quirks(struct ufs_hba *hba) { int err = 0; @@ -900,6 +936,9 @@ static int ufs_qcom_apply_dev_quirks(struct ufs_hba *hba) if (hba->dev_quirks & UFS_DEVICE_QUIRK_HOST_PA_SAVECONFIGTIME) err = ufs_qcom_quirk_host_pa_saveconfigtime(hba); + if (hba->dev_quirks & UFS_DEVICE_QUIRK_PA_TX_HSG1_SYNC_LENGTH) + ufs_qcom_override_pa_tx_hsg1_sync_len(hba); + return err; } @@ -914,6 +953,10 @@ static struct ufs_dev_quirk ufs_qcom_dev_fixups[] = { { .wmanufacturerid = UFS_VENDOR_WDC, .model = UFS_ANY_MODEL, .quirk = UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE }, + { .wmanufacturerid = UFS_VENDOR_SAMSUNG, + .model = UFS_ANY_MODEL, + .quirk = UFS_DEVICE_QUIRK_PA_TX_HSG1_SYNC_LENGTH | + UFS_DEVICE_QUIRK_PA_TX_DEEMPHASIS_TUNING }, {} }; diff --git a/drivers/ufs/host/ufs-qcom.h b/drivers/ufs/host/ufs-qcom.h index d0e6ec9128e7..05d4cb569c50 100644 --- a/drivers/ufs/host/ufs-qcom.h +++ b/drivers/ufs/host/ufs-qcom.h @@ -122,8 +122,11 @@ enum { TMRLUT_HW_CGC_EN | OCSC_HW_CGC_EN) /* QUniPro Vendor specific attributes */ +#define PA_TX_HSG1_SYNC_LENGTH 0x1552 #define PA_VS_CONFIG_REG1 0x9000 #define DME_VS_CORE_CLK_CTRL 0xD002 +#define TX_HS_EQUALIZER 0x0037 + /* bit and mask definitions for DME_VS_CORE_CLK_CTRL attribute */ #define CLK_1US_CYCLES_MASK_V4 GENMASK(27, 16) #define CLK_1US_CYCLES_MASK GENMASK(7, 0) @@ -141,6 +144,21 @@ enum { #define UNIPRO_CORE_CLK_FREQ_201_5_MHZ 202 #define UNIPRO_CORE_CLK_FREQ_403_MHZ 403 +/* TX_HSG1_SYNC_LENGTH attr value */ +#define PA_TX_HSG1_SYNC_LENGTH_VAL 0x4A + +/* + * Some ufs device vendors need a different TSync length. + * Enable this quirk to give an additional TX_HS_SYNC_LENGTH. + */ +#define UFS_DEVICE_QUIRK_PA_TX_HSG1_SYNC_LENGTH BIT(16) + +/* + * Some ufs device vendors need a different Deemphasis setting. + * Enable this quirk to tune TX Deemphasis parameters. 
+ */ +#define UFS_DEVICE_QUIRK_PA_TX_DEEMPHASIS_TUNING BIT(17) + /* ICE allocator type to share AES engines among TX stream and RX stream */ #define ICE_ALLOCATOR_TYPE 2 -- cgit v1.2.3 From 569330a34a31a52c904239439984a59972c11d28 Mon Sep 17 00:00:00 2001 From: Manish Pandey Date: Fri, 11 Apr 2025 17:46:30 +0530 Subject: scsi: ufs: Introduce quirk to extend PA_HIBERN8TIME for UFS devices MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Samsung UFS devices require additional time in hibern8 mode before exiting, beyond the negotiated handshaking phase between the host and device. Introduce a quirk to increase the PA_HIBERN8TIME parameter by 100 µs, a value derived from experiments, to ensure a proper hibernation process. Signed-off-by: Manish Pandey Link: https://lore.kernel.org/r/20250411121630.21330-3-quic_mapa@quicinc.com Reviewed-by: Bean Huo Reviewed-by: Manivannan Sadhasivam Signed-off-by: Martin K. Petersen --- drivers/ufs/core/ufshcd.c | 29 +++++++++++++++++++++++++++++ include/ufs/ufs_quirks.h | 6 ++++++ 2 files changed, 35 insertions(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index 44156041d88f..ca2b0aae9d11 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -278,6 +278,7 @@ static const struct ufs_dev_quirk ufs_fixups[] = { .model = UFS_ANY_MODEL, .quirk = UFS_DEVICE_QUIRK_DELAY_BEFORE_LPM | UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE | + UFS_DEVICE_QUIRK_PA_HIBER8TIME | UFS_DEVICE_QUIRK_RECOVERY_FROM_DL_NAC_ERRORS }, { .wmanufacturerid = UFS_VENDOR_SKHYNIX, .model = UFS_ANY_MODEL, @@ -8470,6 +8471,31 @@ out: return ret; } +/** + * ufshcd_quirk_override_pa_h8time - Ensures proper adjustment of PA_HIBERN8TIME. + * @hba: per-adapter instance + * + * Some UFS devices require specific adjustments to the PA_HIBERN8TIME parameter + * to ensure proper hibernation timing. This function retrieves the current + * PA_HIBERN8TIME value and increments it by 100us. + */ +static void ufshcd_quirk_override_pa_h8time(struct ufs_hba *hba) +{ + u32 pa_h8time; + int ret; + + ret = ufshcd_dme_get(hba, UIC_ARG_MIB(PA_HIBERN8TIME), &pa_h8time); + if (ret) { + dev_err(hba->dev, "Failed to get PA_HIBERN8TIME: %d\n", ret); + return; + } + + /* Increment by 1 to increase hibernation time by 100 µs */ + ret = ufshcd_dme_set(hba, UIC_ARG_MIB(PA_HIBERN8TIME), pa_h8time + 1); + if (ret) + dev_err(hba->dev, "Failed updating PA_HIBERN8TIME: %d\n", ret); +} + static void ufshcd_tune_unipro_params(struct ufs_hba *hba) { ufshcd_vops_apply_dev_quirks(hba); @@ -8480,6 +8506,9 @@ static void ufshcd_tune_unipro_params(struct ufs_hba *hba) if (hba->dev_quirks & UFS_DEVICE_QUIRK_HOST_PA_TACTIVATE) ufshcd_quirk_tune_host_pa_tactivate(hba); + + if (hba->dev_quirks & UFS_DEVICE_QUIRK_PA_HIBER8TIME) + ufshcd_quirk_override_pa_h8time(hba); } static void ufshcd_clear_dbg_ufs_stats(struct ufs_hba *hba) diff --git a/include/ufs/ufs_quirks.h b/include/ufs/ufs_quirks.h index 41ff44dfa1db..f52de5ed1b3b 100644 --- a/include/ufs/ufs_quirks.h +++ b/include/ufs/ufs_quirks.h @@ -107,4 +107,10 @@ struct ufs_dev_quirk { */ #define UFS_DEVICE_QUIRK_DELAY_AFTER_LPM (1 << 11) +/* + * Some ufs devices may need more time to be in hibern8 before exiting. + * Enable this quirk to give it an additional 100us. 
+ */ +#define UFS_DEVICE_QUIRK_PA_HIBER8TIME (1 << 12) + #endif /* UFS_QUIRKS_H_ */ -- cgit v1.2.3 From 424eafe65647a8d6c690284536e711977153195a Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 7 Apr 2025 17:33:34 -0300 Subject: i2c: cros-ec-tunnel: defer probe if parent EC is not present When i2c-cros-ec-tunnel and the EC driver are built-in, the EC parent device will not be found, leading to NULL pointer dereference. That can also be reproduced by unbinding the controller driver and then loading i2c-cros-ec-tunnel module (or binding the device). [ 271.991245] BUG: kernel NULL pointer dereference, address: 0000000000000058 [ 271.998215] #PF: supervisor read access in kernel mode [ 272.003351] #PF: error_code(0x0000) - not-present page [ 272.008485] PGD 0 P4D 0 [ 272.011022] Oops: Oops: 0000 [#1] SMP NOPTI [ 272.015207] CPU: 0 UID: 0 PID: 3859 Comm: insmod Tainted: G S 6.15.0-rc1-00004-g44722359ed83 #30 PREEMPT(full) 3c7fb39a552e7d949de2ad921a7d6588d3a4fdc5 [ 272.030312] Tainted: [S]=CPU_OUT_OF_SPEC [ 272.034233] Hardware name: HP Berknip/Berknip, BIOS Google_Berknip.13434.356.0 05/17/2021 [ 272.042400] RIP: 0010:ec_i2c_probe+0x2b/0x1c0 [i2c_cros_ec_tunnel] [ 272.048577] Code: 1f 44 00 00 41 57 41 56 41 55 41 54 53 48 83 ec 10 65 48 8b 05 06 a0 6c e7 48 89 44 24 08 4c 8d 7f 10 48 8b 47 50 4c 8b 60 78 <49> 83 7c 24 58 00 0f 84 2f 01 00 00 48 89 fb be 30 06 00 00 4c 9 [ 272.067317] RSP: 0018:ffffa32082a03940 EFLAGS: 00010282 [ 272.072541] RAX: ffff969580b6a810 RBX: ffff969580b68c10 RCX: 0000000000000000 [ 272.079672] RDX: 0000000000000000 RSI: 0000000000000282 RDI: ffff969580b68c00 [ 272.086804] RBP: 00000000fffffdfb R08: 0000000000000000 R09: 0000000000000000 [ 272.093936] R10: 0000000000000000 R11: ffffffffc0600000 R12: 0000000000000000 [ 272.101067] R13: ffffffffa666fbb8 R14: ffffffffc05b5528 R15: ffff969580b68c10 [ 272.108198] FS: 00007b930906fc40(0000) GS:ffff969603149000(0000) knlGS:0000000000000000 [ 272.116282] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 272.122024] CR2: 0000000000000058 CR3: 000000012631c000 CR4: 00000000003506f0 [ 272.129155] Call Trace: [ 272.131606] [ 272.133709] ? acpi_dev_pm_attach+0xdd/0x110 [ 272.137985] platform_probe+0x69/0xa0 [ 272.141652] really_probe+0x152/0x310 [ 272.145318] __driver_probe_device+0x77/0x110 [ 272.149678] driver_probe_device+0x1e/0x190 [ 272.153864] __driver_attach+0x10b/0x1e0 [ 272.157790] ? driver_attach+0x20/0x20 [ 272.161542] bus_for_each_dev+0x107/0x150 [ 272.165553] bus_add_driver+0x15d/0x270 [ 272.169392] driver_register+0x65/0x110 [ 272.173232] ? cleanup_module+0xa80/0xa80 [i2c_cros_ec_tunnel 3a00532f3f4af4a9eade753f86b0f8dd4e4e5698] [ 272.182617] do_one_initcall+0x110/0x350 [ 272.186543] ? security_kernfs_init_security+0x49/0xd0 [ 272.191682] ? __kernfs_new_node+0x1b9/0x240 [ 272.195954] ? security_kernfs_init_security+0x49/0xd0 [ 272.201093] ? __kernfs_new_node+0x1b9/0x240 [ 272.205365] ? kernfs_link_sibling+0x105/0x130 [ 272.209810] ? kernfs_next_descendant_post+0x1c/0xa0 [ 272.214773] ? kernfs_activate+0x57/0x70 [ 272.218699] ? kernfs_add_one+0x118/0x160 [ 272.222710] ? __kernfs_create_file+0x71/0xa0 [ 272.227069] ? sysfs_add_bin_file_mode_ns+0xd6/0x110 [ 272.232033] ? internal_create_group+0x453/0x4a0 [ 272.236651] ? __vunmap_range_noflush+0x214/0x2d0 [ 272.241355] ? __free_frozen_pages+0x1dc/0x420 [ 272.245799] ? free_vmap_area_noflush+0x10a/0x1c0 [ 272.250505] ? 
load_module+0x1509/0x16f0 [ 272.254431] do_init_module+0x60/0x230 [ 272.258181] __se_sys_finit_module+0x27a/0x370 [ 272.262627] do_syscall_64+0x6a/0xf0 [ 272.266206] ? do_syscall_64+0x76/0xf0 [ 272.269956] ? irqentry_exit_to_user_mode+0x79/0x90 [ 272.274836] entry_SYSCALL_64_after_hwframe+0x55/0x5d [ 272.279887] RIP: 0033:0x7b9309168d39 [ 272.283466] Code: 5b 41 5c 5d c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d af 40 0c 00 f7 d8 64 89 01 8 [ 272.302210] RSP: 002b:00007fff50f1a288 EFLAGS: 00000246 ORIG_RAX: 0000000000000139 [ 272.309774] RAX: ffffffffffffffda RBX: 000058bf9b50f6d0 RCX: 00007b9309168d39 [ 272.316905] RDX: 0000000000000000 RSI: 000058bf6c103a77 RDI: 0000000000000003 [ 272.324036] RBP: 00007fff50f1a2e0 R08: 00007fff50f19218 R09: 0000000021ec4150 [ 272.331166] R10: 000058bf9b50f7f0 R11: 0000000000000246 R12: 0000000000000000 [ 272.338296] R13: 00000000fffffffe R14: 0000000000000000 R15: 000058bf6c103a77 [ 272.345428] [ 272.347617] Modules linked in: i2c_cros_ec_tunnel(+) [ 272.364585] gsmi: Log Shutdown Reason 0x03 Returning -EPROBE_DEFER will allow the device to be bound once the controller is bound, in the case of built-in drivers. Fixes: 9d230c9e4f4e ("i2c: ChromeOS EC tunnel driver") Signed-off-by: Thadeu Lima de Souza Cascardo Cc: # v3.16+ Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250407-null-ec-parent-v1-1-f7dda62d3110@igalia.com --- drivers/i2c/busses/i2c-cros-ec-tunnel.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/i2c/busses/i2c-cros-ec-tunnel.c b/drivers/i2c/busses/i2c-cros-ec-tunnel.c index 43bf90d90eeb..208ce4f9e782 100644 --- a/drivers/i2c/busses/i2c-cros-ec-tunnel.c +++ b/drivers/i2c/busses/i2c-cros-ec-tunnel.c @@ -247,6 +247,9 @@ static int ec_i2c_probe(struct platform_device *pdev) u32 remote_bus; int err; + if (!ec) + return dev_err_probe(dev, -EPROBE_DEFER, "couldn't find parent EC device\n"); + if (!ec->cmd_xfer) { dev_err(dev, "Missing sendrecv\n"); return -EINVAL; -- cgit v1.2.3 From cd35b6cb46649750b7dbd0df0e2d767415d8917b Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 1 Apr 2025 15:02:21 -0700 Subject: nfs: add missing selections of CONFIG_CRC32 nfs.ko, nfsd.ko, and lockd.ko all use crc32_le(), which is available only when CONFIG_CRC32 is enabled. But the only NFS kconfig option that selected CONFIG_CRC32 was CONFIG_NFS_DEBUG, which is client-specific and did not actually guard the use of crc32_le() even on the client. The code worked around this bug by only actually calling crc32_le() when CONFIG_CRC32 is built-in, instead hard-coding '0' in other cases. This avoided randconfig build errors, and in real kernels the fallback code was unlikely to be reached since CONFIG_CRC32 is 'default y'. But, this really needs to just be done properly, especially now that I'm planning to update CONFIG_CRC32 to not be 'default y'. Therefore, make CONFIG_NFS_FS, CONFIG_NFSD, and CONFIG_LOCKD select CONFIG_CRC32. Then remove the fallback code that becomes unnecessary, as well as the selection of CONFIG_CRC32 from CONFIG_NFS_DEBUG. 
Fixes: 1264a2f053a3 ("NFS: refactor code for calculating the crc32 hash of a filehandle") Signed-off-by: Eric Biggers Acked-by: Anna Schumaker Signed-off-by: Chuck Lever --- fs/Kconfig | 1 + fs/nfs/Kconfig | 2 +- fs/nfs/internal.h | 7 ------- fs/nfs/nfs4session.h | 4 ---- fs/nfsd/Kconfig | 1 + fs/nfsd/nfsfh.h | 7 ------- include/linux/nfs.h | 7 ------- 7 files changed, 3 insertions(+), 26 deletions(-) diff --git a/fs/Kconfig b/fs/Kconfig index 64d420e3c475..8fd1011f7d62 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -368,6 +368,7 @@ config GRACE_PERIOD config LOCKD tristate depends on FILE_LOCKING + select CRC32 select GRACE_PERIOD config LOCKD_V4 diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index d3f76101ad4b..07932ce9246c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -2,6 +2,7 @@ config NFS_FS tristate "NFS client support" depends on INET && FILE_LOCKING && MULTIUSER + select CRC32 select LOCKD select SUNRPC select NFS_COMMON @@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS config NFS_DEBUG bool depends on NFS_FS && SUNRPC_DEBUG - select CRC32 default y config NFS_DISABLE_UDP_SUPPORT diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index fae2c7ae4acc..59bb4d0338f3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -899,18 +899,11 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts) return ((u64)ts->tv_sec << 30) + ts->tv_nsec; } -#ifdef CONFIG_CRC32 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) { return ~crc32_le(0xFFFFFFFF, &stateid->other[0], NFS4_STATEID_OTHER_SIZE); } -#else -static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) -{ - return 0; -} -#endif static inline bool nfs_error_is_fatal(int err) { diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h index 351616c61df5..f9c291e2165c 100644 --- a/fs/nfs/nfs4session.h +++ b/fs/nfs/nfs4session.h @@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst, memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN); } -#ifdef CONFIG_CRC32 /* * nfs_session_id_hash - calculate the crc32 hash for the session id * @session - pointer to session */ #define nfs_session_id_hash(sess_id) \ (~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data))) -#else -#define nfs_session_id_hash(session) (0) -#endif #else /* defined(CONFIG_NFS_V4_1) */ static inline int nfs4_init_session(struct nfs_client *clp) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 792d3fed1b45..731a88f6313e 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -4,6 +4,7 @@ config NFSD depends on INET depends on FILE_LOCKING depends on FSNOTIFY + select CRC32 select LOCKD select SUNRPC select EXPORTFS diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 876152a91f12..5103c2f4d225 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -267,7 +267,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, return true; } -#ifdef CONFIG_CRC32 /** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle @@ -279,12 +278,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size); } -#else -static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) -{ - return 0; -} -#endif /** * fh_clear_pre_post_attrs - Reset pre/post attributes diff --git a/include/linux/nfs.h b/include/linux/nfs.h index 9ad727ddfedb..0906a0b40c6a 100644 --- a/include/linux/nfs.h +++ b/include/linux/nfs.h @@ -55,7 +55,6 @@ enum nfs3_stable_how { NFS_INVALID_STABLE_HOW = -1 }; -#ifdef CONFIG_CRC32 /** * nfs_fhandle_hash - calculate the 
crc32 hash for the filehandle * @fh - pointer to filehandle @@ -67,10 +66,4 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } -#else /* CONFIG_CRC32 */ -static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) -{ - return 0; -} -#endif /* CONFIG_CRC32 */ #endif /* _LINUX_NFS_H */ -- cgit v1.2.3 From a1d14d931bf700c1025db8c46d6731aa5cf440f9 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 10 Apr 2025 09:57:08 +0800 Subject: nfsd: decrease sc_count directly if fail to queue dl_recall A deadlock warning occurred when invoking nfs4_put_stid following a failed dl_recall queue operation: T1 T2 nfs4_laundromat nfs4_get_client_reaplist nfs4_anylock_blockers __break_lease spin_lock // ctx->flc_lock spin_lock // clp->cl_lock nfs4_lockowner_has_blockers locks_owner_has_blockers spin_lock // flctx->flc_lock nfsd_break_deleg_cb nfsd_break_one_deleg nfs4_put_stid refcount_dec_and_lock spin_lock // clp->cl_lock When a file is opened, an nfs4_delegation is allocated with sc_count initialized to 1, and the file_lease holds a reference to the delegation. The file_lease is then associated with the file through kernel_setlease. The disassociation is performed in nfsd4_delegreturn via the following call chain: nfsd4_delegreturn --> destroy_delegation --> destroy_unhashed_deleg --> nfs4_unlock_deleg_lease --> kernel_setlease --> generic_delete_lease The corresponding sc_count reference will be released after this disassociation. Since nfsd_break_one_deleg executes while holding the flc_lock, the disassociation process becomes blocked when attempting to acquire flc_lock in generic_delete_lease. This means: 1) sc_count in nfsd_break_one_deleg will not be decremented to 0; 2) The nfs4_put_stid called by nfsd_break_one_deleg will not attempt to acquire cl_lock; 3) Consequently, no deadlock condition is created. Given that sc_count in nfsd_break_one_deleg remains non-zero, we can safely perform refcount_dec on sc_count directly. This approach effectively avoids triggering deadlock warnings. Fixes: 230ca758453c ("nfsd: put dl_stid if fail to queue dl_recall") Signed-off-by: Li Lingfeng Reviewed-by: Jeff Layton Signed-off-by: Chuck Lever --- fs/nfsd/nfs4state.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2041268b398a..59a693f22452 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -5430,7 +5430,7 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) queued = nfsd4_run_cb(&dp->dl_recall); WARN_ON_ONCE(!queued); if (!queued) - nfs4_put_stid(&dp->dl_stid); + refcount_dec(&dp->dl_stid.sc_count); } /* Called from break_lease() with flc_lock held. */ -- cgit v1.2.3 From 289cae889a7464281b44df7f777fd5238ddfad7f Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 7 Apr 2025 15:29:50 +0200 Subject: MAINTAINERS: pci: add entry for Rust PCI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bjorn, Krzysztof and I agreed that I will maintain the Rust PCI code. Therefore, create a new entry in the MAINTAINERS file. Acked-by: Bjorn Helgaas Acked-by: Krzysztof Wilczyński Link: https://lore.kernel.org/r/20250407133059.164042-1-dakr@kernel.org [ Align Krzysztof's email address. 
- Danilo ] Signed-off-by: Danilo Krummrich --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..778d7d168be3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18686,6 +18686,16 @@ F: include/asm-generic/pci* F: include/linux/of_pci.h F: include/linux/pci* F: include/uapi/linux/pci* + +PCI SUBSYSTEM [RUST] +M: Danilo Krummrich +R: Bjorn Helgaas +R: Krzysztof Wilczyński +L: linux-pci@vger.kernel.org +S: Maintained +C: irc://irc.oftc.net/linux-pci +T: git git://git.kernel.org/pub/scm/linux/kernel/git/pci/pci.git +F: rust/helpers/pci.c F: rust/kernel/pci.rs F: samples/rust/rust_driver_pci.rs -- cgit v1.2.3 From 53bd97801632c940767f4c8407c2cbdeb56b40e7 Mon Sep 17 00:00:00 2001 From: Christian Schrefl Date: Sun, 13 Apr 2025 21:26:56 +0200 Subject: rust: firmware: Use `ffi::c_char` type in `FwFunc` The `FwFunc` struct contains an function with a char pointer argument, for which a `*const u8` pointer was used. This is not really the "proper" type for this, so use a `*const kernel::ffi::c_char` pointer instead. This has no real functionality changes, since now `kernel::ffi::c_char` (which bindgen uses for `char`) is now a type alias to `u8` anyways, but before commit 1bae8729e50a ("rust: map `long` to `isize` and `char` to `u8`") the concrete type of `kernel::ffi::c_char` depended on the architecture (However all supported architectures at the time mapped to `i8`). This caused problems on the v6.13 tag when building for 32 bit arm (with my patches), since back then `*const i8` was used in the function argument and the function that bindgen generated used `*const core::ffi::c_char` which Rust mapped to `*const u8` on 32 bit arm. The stable v6.13.y branch does not have this issue since commit 1bae8729e50a ("rust: map `long` to `isize` and `char` to `u8`") was backported. This caused the following build error: ``` error[E0308]: mismatched types --> rust/kernel/firmware.rs:20:4 | 20 | Self(bindings::request_firmware) | ---- ^^^^^^^^^^^^^^^^^^^^^^^^^^ expected fn pointer, found fn item | | | arguments to this function are incorrect | = note: expected fn pointer `unsafe extern "C" fn(_, *const i8, _) -> _` found fn item `unsafe extern "C" fn(_, *const u8, _) -> _ {request_firmware}` note: tuple struct defined here --> rust/kernel/firmware.rs:14:8 | 14 | struct FwFunc( | ^^^^^^ error[E0308]: mismatched types --> rust/kernel/firmware.rs:24:14 | 24 | Self(bindings::firmware_request_nowarn) | ---- ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ expected fn pointer, found fn item | | | arguments to this function are incorrect | = note: expected fn pointer `unsafe extern "C" fn(_, *const i8, _) -> _` found fn item `unsafe extern "C" fn(_, *const u8, _) -> _ {firmware_request_nowarn}` note: tuple struct defined here --> rust/kernel/firmware.rs:14:8 | 14 | struct FwFunc( | ^^^^^^ error[E0308]: mismatched types --> rust/kernel/firmware.rs:64:45 | 64 | let ret = unsafe { func.0(pfw as _, name.as_char_ptr(), dev.as_raw()) }; | ------ ^^^^^^^^^^^^^^^^^^ expected `*const i8`, found `*const u8` | | | arguments to this function are incorrect | = note: expected raw pointer `*const i8` found raw pointer `*const u8` error: aborting due to 3 previous errors ``` Fixes: de6582833db0 ("rust: add firmware abstractions") Cc: stable@vger.kernel.org Reviewed-by: Benno Lossin Signed-off-by: Christian Schrefl Acked-by: Miguel Ojeda Link: https://lore.kernel.org/r/20250413-rust_arm_fix_fw_abstaction-v3-1-8dd7c0bbcd47@gmail.com [ Add firmware prefix to commit subject. 
- Danilo ] Signed-off-by: Danilo Krummrich --- rust/kernel/firmware.rs | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rust/kernel/firmware.rs b/rust/kernel/firmware.rs index f04b058b09b2..2494c96e105f 100644 --- a/rust/kernel/firmware.rs +++ b/rust/kernel/firmware.rs @@ -4,7 +4,7 @@ //! //! C header: [`include/linux/firmware.h`](srctree/include/linux/firmware.h) -use crate::{bindings, device::Device, error::Error, error::Result, str::CStr}; +use crate::{bindings, device::Device, error::Error, error::Result, ffi, str::CStr}; use core::ptr::NonNull; /// # Invariants @@ -12,7 +12,11 @@ use core::ptr::NonNull; /// One of the following: `bindings::request_firmware`, `bindings::firmware_request_nowarn`, /// `bindings::firmware_request_platform`, `bindings::request_firmware_direct`. struct FwFunc( - unsafe extern "C" fn(*mut *const bindings::firmware, *const u8, *mut bindings::device) -> i32, + unsafe extern "C" fn( + *mut *const bindings::firmware, + *const ffi::c_char, + *mut bindings::device, + ) -> i32, ); impl FwFunc { -- cgit v1.2.3 From 16c22c56d4282584742022a37d4f79a46ca6094a Mon Sep 17 00:00:00 2001 From: Daniel Jurgens Date: Tue, 4 Mar 2025 10:14:42 -0600 Subject: virtio_pci: Use self group type for cap commands Section 2.12.1.2 of v1.4 of the VirtIO spec states: The device and driver capabilities commands are currently defined for self group type. 1. VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY 2. VIRTIO_ADMIN_CMD_DEVICE_CAP_GET 3. VIRTIO_ADMIN_CMD_DRIVER_CAP_SET Fixes: bfcad518605d ("virtio: Manage device and driver capabilities via the admin commands") Signed-off-by: Daniel Jurgens Reviewed-by: Parav Pandit Message-Id: <20250304161442.90700-1-danielj@nvidia.com> Signed-off-by: Michael S. Tsirkin --- drivers/virtio/virtio_pci_modern.c | 4 ++-- include/uapi/linux/virtio_pci.h | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 5eaade757860..d50fe030d825 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -247,7 +247,7 @@ virtio_pci_admin_cmd_dev_parts_objects_enable(struct virtio_device *virtio_dev) sg_init_one(&data_sg, get_data, sizeof(*get_data)); sg_init_one(&result_sg, result, sizeof(*result)); cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET); - cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SELF); cmd.data_sg = &data_sg; cmd.result_sg = &result_sg; ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); @@ -305,7 +305,7 @@ static void virtio_pci_admin_cmd_cap_init(struct virtio_device *virtio_dev) sg_init_one(&result_sg, data, sizeof(*data)); cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY); - cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SELF); cmd.result_sg = &result_sg; ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index 8549d4571257..c691ac210ce2 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -246,6 +246,7 @@ struct virtio_pci_cfg_cap { #define VIRTIO_ADMIN_CMD_LIST_USE 0x1 /* Admin command group type. */ +#define VIRTIO_ADMIN_GROUP_TYPE_SELF 0x0 #define VIRTIO_ADMIN_GROUP_TYPE_SRIOV 0x1 /* Transitional device admin command. 
*/ -- cgit v1.2.3 From a940e0a685575424d33324ec7f0089045249de0a Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Mon, 3 Mar 2025 09:52:37 +0100 Subject: vhost: fix VHOST_*_OWNER documentation VHOST_OWNER_SET and VHOST_OWNER_RESET are used in the documentation instead of VHOST_SET_OWNER and VHOST_RESET_OWNER respectively. To avoid confusion, let's use the right names in the documentation. No change to the API, only the documentation is involved. Signed-off-by: Stefano Garzarella Message-Id: <20250303085237.19990-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin --- include/uapi/linux/vhost.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index b95dd84eef2d..d4b3e2ae1314 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -28,10 +28,10 @@ /* Set current process as the (exclusive) owner of this file descriptor. This * must be called before any other vhost command. Further calls to - * VHOST_OWNER_SET fail until VHOST_OWNER_RESET is called. */ + * VHOST_SET_OWNER fail until VHOST_RESET_OWNER is called. */ #define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01) /* Give up ownership, and reset the device to default values. - * Allows subsequent call to VHOST_OWNER_SET to succeed. */ + * Allows subsequent call to VHOST_SET_OWNER to succeed. */ #define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02) /* Set up/modify memory layout */ -- cgit v1.2.3 From 2e2f925fe737576df2373931c95e1a2b66efdfef Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Wed, 12 Mar 2025 21:04:12 +0800 Subject: virtio_ring: Fix data race by tagging event_triggered as racy for KCSAN syzbot reports a data-race when accessing the event_triggered, here is the simplified stack when the issue occurred: ================================================================== BUG: KCSAN: data-race in virtqueue_disable_cb / virtqueue_enable_cb_delayed write to 0xffff8881025bc452 of 1 bytes by task 3288 on cpu 0: virtqueue_enable_cb_delayed+0x42/0x3c0 drivers/virtio/virtio_ring.c:2653 start_xmit+0x230/0x1310 drivers/net/virtio_net.c:3264 __netdev_start_xmit include/linux/netdevice.h:5151 [inline] netdev_start_xmit include/linux/netdevice.h:5160 [inline] xmit_one net/core/dev.c:3800 [inline] read to 0xffff8881025bc452 of 1 bytes by interrupt on cpu 1: virtqueue_disable_cb_split drivers/virtio/virtio_ring.c:880 [inline] virtqueue_disable_cb+0x92/0x180 drivers/virtio/virtio_ring.c:2566 skb_xmit_done+0x5f/0x140 drivers/net/virtio_net.c:777 vring_interrupt+0x161/0x190 drivers/virtio/virtio_ring.c:2715 __handle_irq_event_percpu+0x95/0x490 kernel/irq/handle.c:158 handle_irq_event_percpu kernel/irq/handle.c:193 [inline] value changed: 0x01 -> 0x00 ================================================================== When the data race occurs, the function virtqueue_enable_cb_delayed() sets event_triggered to false, and virtqueue_disable_cb_split/packed() reads it as false due to the race condition. Since event_triggered is an unreliable hint used for optimization, this should only cause the driver temporarily suggest that the device not send an interrupt notification when the event index is used. Fix this KCSAN reported data-race issue by explicitly tagging the access as data_racy. Reported-by: syzbot+efe683d57990864b8c8e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67c7761a.050a0220.15b4b9.0018.GAE@google.com/ Signed-off-by: Zhongqiu Han Message-Id: <20250312130412.3516307-1-quic_zhonhan@quicinc.com> Signed-off-by: Michael S. 
Tsirkin Acked-by: Jason Wang --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index fdd2d2b07b5a..b784aab66867 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2650,7 +2650,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) - vq->event_triggered = false; + data_race(vq->event_triggered = false); return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); -- cgit v1.2.3 From 0866ee8e50f017731b80891294c0edd0f5fcd0a9 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Thu, 3 Apr 2025 18:38:05 +0200 Subject: rust: disable `clippy::needless_continue` Starting with Rust 1.86.0, Clippy's `needless_continue` lint complains about the last statement of a loop [1], including cases like: while ... { match ... { ... if ... => { ... return ...; } _ => continue, } } as well as nested `match`es in a loop. One solution is changing `continue` for `()` [2], but arguably using `continue` shows the intent better when it is alone in an arm like that. Moreover, I am not sure we want to force people to try to find other ways to write the code either, in cases when that applies. In addition, the help text does not really apply in the new cases the lint has introduced, e.g. here one cannot simply "drop" the expression: warning: this `continue` expression is redundant --> rust/macros/helpers.rs:85:18 | 85 | _ => continue, | ^^^^^^^^ | = help: consider dropping the `continue` expression = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#needless_continue = note: requested on the command line with `-W clippy::needless-continue` The examples in the documentation do not show a case like this, either, so the second "help" line does not help. In addition, locally disabling the lint is not possible with `expect`, since the behavior differs across versions. Using `allow` would be possible, but, even then, an extra line just for this is a bit too much, especially if there are other ways to satisfy the lint. Finally, the lint is still in the "pedantic" category and disabled by default by Clippy. Thus disable the lint, at least for the time being. Feedback was submitted to upstream Clippy, in case this can be improved or perhaps the lint split into several [3]. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). 
Link: https://github.com/rust-lang/rust-clippy/pull/13891 [1] Link: https://lore.kernel.org/rust-for-linux/20250401221205.52381-1-ojeda@kernel.org/ [2] Link: https://github.com/rust-lang/rust-clippy/issues/14536 [3] Link: https://lore.kernel.org/r/20250403163805.67770-1-ojeda@kernel.org Reviewed-by: Alice Ryhl Signed-off-by: Miguel Ojeda --- Makefile | 1 - 1 file changed, 1 deletion(-) diff --git a/Makefile b/Makefile index 38689a0c3605..3111475fdc92 100644 --- a/Makefile +++ b/Makefile @@ -477,7 +477,6 @@ export rust_common_flags := --edition=2021 \ -Wclippy::ignored_unit_patterns \ -Wclippy::mut_mut \ -Wclippy::needless_bitwise_bool \ - -Wclippy::needless_continue \ -Aclippy::needless_lifetimes \ -Wclippy::no_mangle_with_rust_abi \ -Wclippy::undocumented_unsafe_blocks \ -- cgit v1.2.3 From 2042c352e21d19eaf5f9e22fb6afce72293ef28c Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Mon, 14 Apr 2025 21:37:52 +1000 Subject: dma/mapping.c: dev_dbg support for dma_addressing_limited MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the debug and resolution of an issue involving forced use of bounce buffers, 7170130e4c72 ("x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers"). It would have been easier to debug the issue if dma_addressing_limited() had debug information about the device not being able to address all of memory and thus forcing all accesses through a bounce buffer. Please see[2] Implement dev_dbg to debug the potential use of bounce buffers when we hit the condition. When swiotlb is used, dma_addressing_limited() is used to determine the size of maximum dma buffer size in dma_direct_max_mapping_size(). The debug prints could be triggered in that check as well (when enabled). Link: https://lore.kernel.org/lkml/20250401000752.249348-1-balbirs@nvidia.com/ [1] Link: https://lore.kernel.org/lkml/20250310112206.4168-1-spasswolf@web.de/ [2] Cc: Marek Szyprowski Cc: Robin Murphy Cc: "Christian König" Cc: Ingo Molnar Cc: Kees Cook Cc: Bjorn Helgaas Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Cc: Alex Deucher Cc: Bert Karwatzki Cc: Christoph Hellwig Signed-off-by: Balbir Singh Reviewed-by: Christoph Hellwig Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250414113752.3298276-1-balbirs@nvidia.com --- kernel/dma/mapping.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index cda127027e48..67da08fa6723 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -918,7 +918,7 @@ EXPORT_SYMBOL(dma_set_coherent_mask); * the system, else %false. Lack of addressing bits is the prime reason for * bounce buffering, but might not be the only one. 
*/ -bool dma_addressing_limited(struct device *dev) +static bool __dma_addressing_limited(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -930,6 +930,15 @@ bool dma_addressing_limited(struct device *dev) return false; return !dma_direct_all_ram_mapped(dev); } + +bool dma_addressing_limited(struct device *dev) +{ + if (!__dma_addressing_limited(dev)) + return false; + + dev_dbg(dev, "device is DMA addressing limited\n"); + return true; +} EXPORT_SYMBOL_GPL(dma_addressing_limited); size_t dma_max_mapping_size(struct device *dev) -- cgit v1.2.3 From 46e24a545cdb4556f8128c90ecc34eeae52477a0 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Wed, 9 Apr 2025 00:03:11 +0200 Subject: rust: kasan/kbuild: fix missing flags on first build If KASAN is enabled, and one runs in a clean repository e.g.: make LLVM=1 prepare make LLVM=1 prepare Then the Rust code gets rebuilt, which should not happen. The reason is some of the LLVM KASAN `rustc` flags are added in the second run: -Cllvm-args=-asan-instrumentation-with-call-threshold=10000 -Cllvm-args=-asan-stack=0 -Cllvm-args=-asan-globals=1 -Cllvm-args=-asan-kernel-mem-intrinsic-prefix=1 Further runs do not rebuild Rust because the flags do not change anymore. Rebuilding like that in the second run is bad, even if this just happens with KASAN enabled, but missing flags in the first one is even worse. The root issue is that we pass, for some architectures and for the moment, a generated `target.json` file. That file is not ready by the time `rustc` gets called for the flag test, and thus the flag test fails just because the file is not available, e.g.: $ ... --target=./scripts/target.json ... -Cllvm-args=... error: target file "./scripts/target.json" does not exist There are a few approaches we could take here to solve this. For instance, we could ensure that every time that the config is rebuilt, we regenerate the file and recompute the flags. Or we could use the LLVM version to check for these flags, instead of testing the flag (which may have other advantages, such as allowing us to detect renames on the LLVM side). However, it may be easier than that: `rustc` is aware of the `-Cllvm-args` regardless of the `--target` (e.g. I checked that the list printed is the same, plus that I can check for these flags even if I pass a completely unrelated target), and thus we can just eliminate the dependency completely. Thus filter out the target. This does mean that `rustc-option` cannot be used to test a flag that requires the right target, but we don't have other users yet, it is a minimal change and we want to get rid of custom targets in the future. We could only filter in the case `target.json` is used, to make it work in more cases, but then it would be harder to notice that it may not work in a couple architectures. 
Cc: Matthew Maurer Cc: Sami Tolvanen Cc: stable@vger.kernel.org Fixes: e3117404b411 ("kbuild: rust: Enable KASAN support") Tested-by: Alice Ryhl Link: https://lore.kernel.org/r/20250408220311.1033475-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/Makefile.compiler | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.compiler b/scripts/Makefile.compiler index 8956587b8547..7ed7f92a7daa 100644 --- a/scripts/Makefile.compiler +++ b/scripts/Makefile.compiler @@ -80,7 +80,7 @@ ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) # TODO: remove RUSTC_BOOTSTRAP=1 when we raise the minimum GNU Make version to 4.4 __rustc-option = $(call try-run,\ echo '#![allow(missing_docs)]#![feature(no_core)]#![no_core]' | RUSTC_BOOTSTRAP=1\ - $(1) --sysroot=/dev/null $(filter-out --sysroot=/dev/null,$(2)) $(3)\ + $(1) --sysroot=/dev/null $(filter-out --sysroot=/dev/null --target=%,$(2)) $(3)\ --crate-type=rlib --out-dir=$(TMPOUT) --emit=obj=- - >/dev/null,$(3),$(4)) # rustc-option -- cgit v1.2.3 From a3cd5f507b72c0532c3345b6913557efab34f405 Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Sun, 13 Apr 2025 02:23:38 +0200 Subject: objtool/rust: add one more `noreturn` Rust function for Rust 1.86.0 Starting with Rust 1.86.0 (see upstream commit b151b513ba2b ("Insert null checks for pointer dereferences when debug assertions are enabled") [1]), under some kernel configurations with `CONFIG_RUST_DEBUG_ASSERTIONS=y`, one may trigger a new `objtool` warning: rust/kernel.o: warning: objtool: _R..._6kernel9workqueue6system() falls through to next function _R...9workqueue14system_highpri() due to a call to the `noreturn` symbol: core::panicking::panic_null_pointer_dereference Thus add it to the list so that `objtool` knows it is actually `noreturn`. See commit 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") for more details. Cc: stable@vger.kernel.org # Needed in 6.12.y and later (Rust is pinned in older LTSs). Fixes: 56d680dd23c3 ("objtool/rust: list `noreturn` Rust functions") Link: https://github.com/rust-lang/rust/commit/b151b513ba2b65c7506ec1a80f2712bbd09154d1 [1] Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250413002338.1741593-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- tools/objtool/check.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 4a1f6c3169b3..67006eeb30c8 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -225,6 +225,7 @@ static bool is_rust_noreturn(const struct symbol *func) str_ends_with(func->name, "_4core9panicking14panic_nounwind") || str_ends_with(func->name, "_4core9panicking18panic_bounds_check") || str_ends_with(func->name, "_4core9panicking19assert_failed_inner") || + str_ends_with(func->name, "_4core9panicking30panic_null_pointer_dereference") || str_ends_with(func->name, "_4core9panicking36panic_misaligned_pointer_dereference") || strstr(func->name, "_4core9panicking13assert_failed") || strstr(func->name, "_4core9panicking11panic_const24panic_const_") || -- cgit v1.2.3 From ec0c7afa70d5ccec44e736b60ed2e7c191d054cb Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Mon, 14 Apr 2025 14:27:01 +0530 Subject: drm/i915/display: Add macro for checking 3 DSC engines 3 DSC engines per pipe is currently supported only for BMG. Add a macro to check whether a platform supports 3 DSC engines per pipe. v2:Fix Typo in macro argument. (Suraj). Added fixes tag. 
Bspec: 50175 Fixes: be7f5fcdf4a0 ("drm/i915/dp: Enable 3 DSC engines for 12 slices") Cc: Ankit Nautiyal Cc: Suraj Kandpal Cc: # v6.14+ Signed-off-by: Ankit Nautiyal Reviewed-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250414085701.2802374-1-ankit.k.nautiyal@intel.com (cherry picked from commit 6998cfce0e1db58c730d08cadc6bfd71e26e2de0) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_display_device.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/display/intel_display_device.h b/drivers/gpu/drm/i915/display/intel_display_device.h index 717286981687..7a3bb77c7af7 100644 --- a/drivers/gpu/drm/i915/display/intel_display_device.h +++ b/drivers/gpu/drm/i915/display/intel_display_device.h @@ -161,6 +161,7 @@ struct intel_display_platforms { #define HAS_DPT(__display) (DISPLAY_VER(__display) >= 13) #define HAS_DSB(__display) (DISPLAY_INFO(__display)->has_dsb) #define HAS_DSC(__display) (DISPLAY_RUNTIME_INFO(__display)->has_dsc) +#define HAS_DSC_3ENGINES(__display) (DISPLAY_VERx100(__display) == 1401 && HAS_DSC(__display)) #define HAS_DSC_MST(__display) (DISPLAY_VER(__display) >= 12 && HAS_DSC(__display)) #define HAS_FBC(__display) (DISPLAY_RUNTIME_INFO(__display)->fbc_mask != 0) #define HAS_FBC_DIRTY_RECT(__display) (DISPLAY_VER(__display) >= 30) -- cgit v1.2.3 From 3a47280b768748992ee34bd52c394c60b2845af3 Mon Sep 17 00:00:00 2001 From: Ankit Nautiyal Date: Mon, 14 Apr 2025 08:12:56 +0530 Subject: drm/i915/dp: Check for HAS_DSC_3ENGINES while configuring DSC slices DSC 12 slices configuration is used for some specific cases with Ultrajoiner. This can be supported only when each of the 4 joined pipes have 3 DSC engines each. Add the missing check for 3 DSC engines support before using 3 DSC slices per pipe. Fixes: be7f5fcdf4a0 ("drm/i915/dp: Enable 3 DSC engines for 12 slices") Cc: Ankit Nautiyal Cc: Suraj Kandpal Cc: # v6.14+ Signed-off-by: Ankit Nautiyal Reviewed-by: Suraj Kandpal Link: https://lore.kernel.org/r/20250414024256.2782702-3-ankit.k.nautiyal@intel.com (cherry picked from commit da9b1c61e7f7b327dd70c5f073ba04d419a55ef8) Signed-off-by: Jani Nikula --- drivers/gpu/drm/i915/display/intel_dp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_dp.c b/drivers/gpu/drm/i915/display/intel_dp.c index 9476aaa91900..392c3653d0d7 100644 --- a/drivers/gpu/drm/i915/display/intel_dp.c +++ b/drivers/gpu/drm/i915/display/intel_dp.c @@ -1050,10 +1050,11 @@ u8 intel_dp_dsc_get_slice_count(const struct intel_connector *connector, u8 test_slice_count = valid_dsc_slicecount[i] * num_joined_pipes; /* - * 3 DSC Slices per pipe need 3 DSC engines, - * which is supported only with Ultrajoiner. + * 3 DSC Slices per pipe need 3 DSC engines, which is supported only + * with Ultrajoiner only for some platforms. */ - if (valid_dsc_slicecount[i] == 3 && num_joined_pipes != 4) + if (valid_dsc_slicecount[i] == 3 && + (!HAS_DSC_3ENGINES(display) || num_joined_pipes != 4)) continue; if (test_slice_count > -- cgit v1.2.3 From c443279a87d54bf3027925cb3eb2baf51c3b26c9 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 15 Apr 2025 10:22:04 +0200 Subject: Kconfig: switch CONFIG_SYSFS_SYCALL default to n This odd system call will be removed in the future. Let's decouple it from CONFIG_EXPERT and switch the default to n as a first step. 
Link: https://lore.kernel.org/20250415-dezimieren-wertpapier-9fd18a211a41@brauner Signed-off-by: Christian Brauner --- init/Kconfig | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/init/Kconfig b/init/Kconfig index dd2ea3b9a799..63f5974b9fa6 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1555,6 +1555,16 @@ config SYSCTL_ARCH_UNALIGN_ALLOW the unaligned access emulation. see arch/parisc/kernel/unaligned.c for reference +config SYSFS_SYSCALL + bool "Sysfs syscall support" + default n + help + sys_sysfs is an obsolete system call no longer supported in libc. + Note that disabling this option is more secure but might break + compatibility with some systems. + + If unsure say N here. + config HAVE_PCSPKR_PLATFORM bool @@ -1599,16 +1609,6 @@ config SGETMASK_SYSCALL If unsure, leave the default option here. -config SYSFS_SYSCALL - bool "Sysfs syscall support" if EXPERT - default y - help - sys_sysfs is an obsolete system call no longer supported in libc. - Note that disabling this option is more secure but might break - compatibility with some systems. - - If unsure say Y here. - config FHANDLE bool "open by fhandle syscalls" if EXPERT select EXPORTFS -- cgit v1.2.3 From ddee68c499f76ae47c011549df5be53db0057402 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 15 Apr 2025 09:45:38 +0200 Subject: hfs{plus}: add deprecation warning Both the hfs and hfsplus filesystem have been orphaned since at least 2014, i.e., over 10 years. It's time to remove them from the kernel as they're exhibiting more and more issues and no one is stepping up to fixing them. Signed-off-by: Christian Brauner --- fs/hfs/super.c | 2 ++ fs/hfsplus/super.c | 2 ++ 2 files changed, 4 insertions(+) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index fe09c2093a93..4413cd8feb9e 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -404,6 +404,8 @@ static int hfs_init_fs_context(struct fs_context *fc) { struct hfs_sb_info *hsb; + pr_warn("The hfs filesystem is deprecated and scheduled to be removed from the kernel in 2025\n"); + hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); if (!hsb) return -ENOMEM; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 948b8aaee33e..58cff4b2a3b4 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -656,6 +656,8 @@ static int hfsplus_init_fs_context(struct fs_context *fc) { struct hfsplus_sb_info *sbi; + pr_warn("The hfsplus filesystem is deprecated and scheduled to be removed from the kernel in 2025\n"); + sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; -- cgit v1.2.3 From c86b300b1ea35959a6e2a63a6497226a6ea90b67 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Apr 2025 22:13:33 +0200 Subject: fs: add kern_path_locked_negative() The audit code relies on the fact that kern_path_locked() returned a path even for a negative dentry. If it doesn't find a valid dentry it immediately calls: audit_find_parent(d_backing_inode(parent_path.dentry)); which assumes that parent_path.dentry is still valid. But it isn't since kern_path_locked() has been changed to path_put() also for a negative dentry. Fix this by adding a helper that implements the required audit semantics and allows us to fix the immediate bleeding. We can find a unified solution for this afterwards. 
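A sketch of the contract the new helper is meant to provide (summarised from the hunks below, under the assumption that callers follow the audit_get_nd() pattern):

	/*
	 * kern_path_locked_negative(name, path):
	 *  - on success, *path refers to the parent directory, whose inode is
	 *    returned locked (I_MUTEX_PARENT) with a reference held
	 *  - the returned dentry may be negative, so callers must check
	 *    d_is_positive() before dereferencing d_backing_inode(dentry)
	 *  - on error, the parent lock and reference are dropped again
	 */

This is what lets the audit code keep using parent_path.dentry even when the watched name does not exist yet.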
Link: https://lore.kernel.org/20250414-rennt-wimmeln-f186c3a780f1@brauner Fixes: 1c3cb50b58c3 ("VFS: change kern_path_locked() and user_path_locked_at() to never return negative dentry") Reported-and-tested-by: Vlastimil Babka Signed-off-by: Christian Brauner --- fs/namei.c | 65 ++++++++++++++++++++++++++++++++++++++------------- include/linux/namei.h | 1 + kernel/audit_watch.c | 16 ++++++++----- 3 files changed, 60 insertions(+), 22 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 8510ff53f12e..267ac9c22dd2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1665,27 +1665,20 @@ static struct dentry *lookup_dcache(const struct qstr *name, return dentry; } -/* - * Parent directory has inode locked exclusive. This is one - * and only case when ->lookup() gets called on non in-lookup - * dentries - as the matter of fact, this only gets called - * when directory is guaranteed to have no in-lookup children - * at all. - * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. - * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. - */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags) +static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, + struct dentry *base, + unsigned int flags) { - struct dentry *dentry = lookup_dcache(name, base, flags); + struct dentry *dentry; struct dentry *old; - struct inode *dir = base->d_inode; + struct inode *dir; + dentry = lookup_dcache(name, base, flags); if (dentry) - goto found; + return dentry; /* Don't create child dentry for a dead directory. */ + dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); @@ -1698,7 +1691,24 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, dput(dentry); dentry = old; } -found: + return dentry; +} + +/* + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. 
+ */ +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) +{ + struct dentry *dentry; + + dentry = lookup_one_qstr_excl_raw(name, base, flags); if (IS_ERR(dentry)) return dentry; if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { @@ -2762,6 +2772,29 @@ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct return d; } +struct dentry *kern_path_locked_negative(const char *name, struct path *path) +{ + struct filename *filename __free(putname) = getname_kernel(name); + struct dentry *d; + struct qstr last; + int type, error; + + error = filename_parentat(AT_FDCWD, filename, 0, path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) { + path_put(path); + return ERR_PTR(-EINVAL); + } + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl_raw(&last, path->dentry, 0); + if (IS_ERR(d)) { + inode_unlock(path->dentry->d_inode); + path_put(path); + } + return d; +} + struct dentry *kern_path_locked(const char *name, struct path *path) { struct filename *filename = getname_kernel(name); diff --git a/include/linux/namei.h b/include/linux/namei.h index e3042176cdf4..bbaf55fb3101 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -62,6 +62,7 @@ extern struct dentry *kern_path_create(int, const char *, struct path *, unsigne extern struct dentry *user_path_create(int, const char __user *, struct path *, unsigned int); extern void done_path_create(struct path *, struct dentry *); extern struct dentry *kern_path_locked(const char *, struct path *); +extern struct dentry *kern_path_locked_negative(const char *, struct path *); extern struct dentry *user_path_locked_at(int , const char __user *, struct path *); int vfs_path_parent_lookup(struct filename *filename, unsigned int flags, struct path *parent, struct qstr *last, int *type, diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 367eaf2c78b7..0ebbbe37a60f 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -347,12 +347,17 @@ static void audit_remove_parent_watches(struct audit_parent *parent) /* Get path information necessary for adding watches. */ static int audit_get_nd(struct audit_watch *watch, struct path *parent) { - struct dentry *d = kern_path_locked(watch->path, parent); + struct dentry *d; + + d = kern_path_locked_negative(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - /* update watch filter fields */ - watch->dev = d->d_sb->s_dev; - watch->ino = d_backing_inode(d)->i_ino; + + if (d_is_positive(d)) { + /* update watch filter fields */ + watch->dev = d->d_sb->s_dev; + watch->ino = d_backing_inode(d)->i_ino; + } inode_unlock(d_backing_inode(parent->dentry)); dput(d); @@ -418,11 +423,10 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list) /* caller expects mutex locked */ mutex_lock(&audit_filter_mutex); - if (ret && ret != -ENOENT) { + if (ret) { audit_put_watch(watch); return ret; } - ret = 0; /* either find an old parent or attach a new one */ parent = audit_find_parent(d_backing_inode(parent_path.dentry)); -- cgit v1.2.3 From 8e553520596bbd5ce832e26e9d721e6a0c797b8b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 31 Mar 2025 13:56:08 +0100 Subject: intel_th: avoid using deprecated page->mapping, index fields The struct page->mapping, index fields are deprecated and soon to be only available as part of a folio. 
It is likely the intel_th code which sets page->mapping, index is was implemented out of concern that some aspect of the page fault logic may encounter unexpected problems should they not. However, the appropriate interface for inserting kernel-allocated memory is vm_insert_page() in a VM_MIXEDMAP. By using the helper function vmf_insert_mixed() we can do this with minimal churn in the existing fault handler. By doing so, we bypass the remainder of the faulting logic. The pages are still pinned so there is no possibility of anything unexpected being done with the pages once established. It would also be reasonable to pre-map everything on fault, however to minimise churn we retain the fault handler. We also eliminate all code which clears page->mapping on teardown as this has now become unnecessary. The MSU code relies on faulting to function correctly, so is by definition dependent on CONFIG_MMU. We avoid spurious reports about compilation failure for unsupported platforms by making this requirement explicit in Kconfig as part of this change too. Signed-off-by: Lorenzo Stoakes Acked-by: Alexander Shishkin Link: https://lore.kernel.org/r/20250331125608.60300-1-lorenzo.stoakes@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/Kconfig | 1 + drivers/hwtracing/intel_th/msu.c | 31 +++++++------------------------ 2 files changed, 8 insertions(+), 24 deletions(-) diff --git a/drivers/hwtracing/intel_th/Kconfig b/drivers/hwtracing/intel_th/Kconfig index 4b6359326ede..4f7d2b6d79e2 100644 --- a/drivers/hwtracing/intel_th/Kconfig +++ b/drivers/hwtracing/intel_th/Kconfig @@ -60,6 +60,7 @@ config INTEL_TH_STH config INTEL_TH_MSU tristate "Intel(R) Trace Hub Memory Storage Unit" + depends on MMU help Memory Storage Unit (MSU) trace output device enables storing STP traces to system memory. 
It supports single diff --git a/drivers/hwtracing/intel_th/msu.c b/drivers/hwtracing/intel_th/msu.c index bf99d79a4192..7163950eb371 100644 --- a/drivers/hwtracing/intel_th/msu.c +++ b/drivers/hwtracing/intel_th/msu.c @@ -19,6 +19,7 @@ #include #include #include +#include #ifdef CONFIG_X86 #include @@ -976,7 +977,6 @@ static void msc_buffer_contig_free(struct msc *msc) for (off = 0; off < msc->nr_pages << PAGE_SHIFT; off += PAGE_SIZE) { struct page *page = virt_to_page(msc->base + off); - page->mapping = NULL; __free_page(page); } @@ -1158,9 +1158,6 @@ static void __msc_buffer_win_free(struct msc *msc, struct msc_window *win) int i; for_each_sg(win->sgt->sgl, sg, win->nr_segs, i) { - struct page *page = msc_sg_page(sg); - - page->mapping = NULL; dma_free_coherent(msc_dev(win->msc)->parent->parent, PAGE_SIZE, sg_virt(sg), sg_dma_address(sg)); } @@ -1601,22 +1598,10 @@ static void msc_mmap_close(struct vm_area_struct *vma) { struct msc_iter *iter = vma->vm_file->private_data; struct msc *msc = iter->msc; - unsigned long pg; if (!atomic_dec_and_mutex_lock(&msc->mmap_count, &msc->buf_mutex)) return; - /* drop page _refcounts */ - for (pg = 0; pg < msc->nr_pages; pg++) { - struct page *page = msc_buffer_get_page(msc, pg); - - if (WARN_ON_ONCE(!page)) - continue; - - if (page->mapping) - page->mapping = NULL; - } - /* last mapping -- drop user_count */ atomic_dec(&msc->user_count); mutex_unlock(&msc->buf_mutex); @@ -1626,16 +1611,14 @@ static vm_fault_t msc_mmap_fault(struct vm_fault *vmf) { struct msc_iter *iter = vmf->vma->vm_file->private_data; struct msc *msc = iter->msc; + struct page *page; - vmf->page = msc_buffer_get_page(msc, vmf->pgoff); - if (!vmf->page) + page = msc_buffer_get_page(msc, vmf->pgoff); + if (!page) return VM_FAULT_SIGBUS; - get_page(vmf->page); - vmf->page->mapping = vmf->vma->vm_file->f_mapping; - vmf->page->index = vmf->pgoff; - - return 0; + get_page(page); + return vmf_insert_mixed(vmf->vma, vmf->address, page_to_pfn_t(page)); } static const struct vm_operations_struct msc_mmap_ops = { @@ -1676,7 +1659,7 @@ out: atomic_dec(&msc->user_count); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY); + vm_flags_set(vma, VM_DONTEXPAND | VM_DONTCOPY | VM_MIXEDMAP); vma->vm_ops = &msc_mmap_ops; return ret; } -- cgit v1.2.3 From 332ec18d57de2f77f43a988cbf1cb7693409434a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 14 Apr 2025 19:40:48 +0200 Subject: MAINTAINERS: update the location of the driver-core git tree The driver core git tree has moved, so properly document it. Cc: Rafael J. Wysocki Cc: Danilo Krummrich Cc: Tejun Heo Cc: Dave Ertman Cc: Ira Weiny Link: https://lore.kernel.org/r/2025041447-showbiz-other-7130@gregkh Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 778d7d168be3..01f86f5c5f81 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3868,7 +3868,7 @@ M: Greg Kroah-Hartman R: Dave Ertman R: Ira Weiny S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: Documentation/driver-api/auxiliary_bus.rst F: drivers/base/auxiliary.c F: include/linux/auxiliary_bus.h @@ -7225,7 +7225,7 @@ M: Greg Kroah-Hartman M: "Rafael J. 
Wysocki" M: Danilo Krummrich S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: Documentation/core-api/kobject.rst F: drivers/base/ F: fs/debugfs/ @@ -13106,7 +13106,7 @@ KERNFS M: Greg Kroah-Hartman M: Tejun Heo S: Supported -T: git git://git.kernel.org/pub/scm/linux/kernel/git/gregkh/driver-core.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: fs/kernfs/ F: include/linux/kernfs.h -- cgit v1.2.3 From 37ffdbd695c02189dbf23d6e7d2385e0299587ca Mon Sep 17 00:00:00 2001 From: Miao Li Date: Mon, 14 Apr 2025 14:29:35 +0800 Subject: usb: quirks: Add delay init quirk for SanDisk 3.2Gen1 Flash Drive The SanDisk 3.2Gen1 Flash Drive, which VID:PID is in 0781:55a3, just like Silicon Motion Flash Drive: https://lore.kernel.org/r/20250401023027.44894-1-limiao870622@163.com also needs the DELAY_INIT quirk, or it will randomly work incorrectly (e.g.: lsusb and can't list this device info) when connecting Huawei hisi platforms and doing thousand of reboot test circles. Cc: stable Signed-off-by: Miao Li Signed-off-by: Lei Huang Link: https://lore.kernel.org/r/20250414062935.159024-1-limiao870622@163.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 87b48c176160..36d3df7d040c 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -369,6 +369,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x0781, 0x5583), .driver_info = USB_QUIRK_NO_LPM }, { USB_DEVICE(0x0781, 0x5591), .driver_info = USB_QUIRK_NO_LPM }, + /* SanDisk Corp. SanDisk 3.2Gen1 */ + { USB_DEVICE(0x0781, 0x55a3), .driver_info = USB_QUIRK_DELAY_INIT }, + /* Realforce 87U Keyboard */ { USB_DEVICE(0x0853, 0x011b), .driver_info = USB_QUIRK_NO_LPM }, -- cgit v1.2.3 From 429a98abfc01d3d4378b7a00969437dc3e8f647c Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 15 Apr 2025 13:45:08 +0300 Subject: usb: typec: class: Unlocked on error in typec_register_partner() We recently added some locking to this function but this error path was accidentally missed. Unlock before returning. Fixes: ec27386de23a ("usb: typec: class: Fix NULL pointer access") Cc: stable Signed-off-by: Dan Carpenter Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/Z_44tOtmml89wQcM@stanley.mountain Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/class.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/typec/class.c b/drivers/usb/typec/class.c index 3df3e3736916..67a533e35150 100644 --- a/drivers/usb/typec/class.c +++ b/drivers/usb/typec/class.c @@ -1056,6 +1056,7 @@ struct typec_partner *typec_register_partner(struct typec_port *port, ret = device_register(&partner->dev); if (ret) { dev_err(&port->dev, "failed to register partner (%d)\n", ret); + mutex_unlock(&port->partner_link_lock); put_device(&partner->dev); return ERR_PTR(ret); } -- cgit v1.2.3 From e1ca3ff28ab1e2c1e70713ef3fa7943c725742c3 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Sat, 12 Apr 2025 09:18:47 +0900 Subject: serial: sifive: lock port in startup()/shutdown() callbacks startup()/shutdown() callbacks access SIFIVE_SERIAL_IE_OFFS. The register is also accessed from write() callback. If console were printing and startup()/shutdown() callback gets called, its access to the register could be overwritten. 
Add port->lock to startup()/shutdown() callbacks to make sure their access to SIFIVE_SERIAL_IE_OFFS is synchronized against write() callback. Fixes: 45c054d0815b ("tty: serial: add driver for the SiFive UART") Signed-off-by: Ryo Takakura Reviewed-by: Petr Mladek Cc: stable@vger.kernel.org Reviewed-by: John Ogness Rule: add Link: https://lore.kernel.org/stable/20250330003522.386632-1-ryotkkr98%40gmail.com Link: https://lore.kernel.org/r/20250412001847.183221-1-ryotkkr98@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sifive.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/tty/serial/sifive.c b/drivers/tty/serial/sifive.c index 5904a2d4cefa..054a8e630ace 100644 --- a/drivers/tty/serial/sifive.c +++ b/drivers/tty/serial/sifive.c @@ -563,8 +563,11 @@ static void sifive_serial_break_ctl(struct uart_port *port, int break_state) static int sifive_serial_startup(struct uart_port *port) { struct sifive_serial_port *ssp = port_to_sifive_serial_port(port); + unsigned long flags; + uart_port_lock_irqsave(&ssp->port, &flags); __ssp_enable_rxwm(ssp); + uart_port_unlock_irqrestore(&ssp->port, flags); return 0; } @@ -572,9 +575,12 @@ static int sifive_serial_startup(struct uart_port *port) static void sifive_serial_shutdown(struct uart_port *port) { struct sifive_serial_port *ssp = port_to_sifive_serial_port(port); + unsigned long flags; + uart_port_lock_irqsave(&ssp->port, &flags); __ssp_disable_rxwm(ssp); __ssp_disable_txwm(ssp); + uart_port_unlock_irqrestore(&ssp->port, flags); } /** -- cgit v1.2.3 From 170d1a3738908eef6a0dbf378ea77fb4ae8e294d Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Tue, 25 Mar 2025 18:49:00 +0000 Subject: binder: fix offset calculation in debug log The vma start address should be substracted from the buffer's user data address and not the other way around. Cc: Tiffany Y. Yang Cc: stable Fixes: 162c79731448 ("binder: avoid user addresses in debug logs") Signed-off-by: Carlos Llamas Reviewed-by: Tiffany Y. Yang Link: https://lore.kernel.org/r/20250325184902.587138-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 76052006bd87..5fc2c8ee61b1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -6373,7 +6373,7 @@ static void print_binder_transaction_ilocked(struct seq_file *m, seq_printf(m, " node %d", buffer->target_node->debug_id); seq_printf(m, " size %zd:%zd offset %lx\n", buffer->data_size, buffer->offsets_size, - proc->alloc.vm_start - buffer->user_data); + buffer->user_data - proc->alloc.vm_start); } static void print_binder_work_ilocked(struct seq_file *m, -- cgit v1.2.3 From 44d9b3f584c59a606b521e7274e658d5b866c699 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Tue, 15 Apr 2025 13:39:01 +0100 Subject: comedi: jr3_pci: Fix synchronous deletion of timer When `jr3_pci_detach()` is called during device removal, it calls `timer_delete_sync()` to stop the timer, but the timer expiry function always reschedules the timer, so the synchronization is ineffective. Call `timer_shutdown_sync()` instead. It does not matter that the timer expiry function pointer is cleared, because the device is being removed. 
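For illustration, a hedged sketch of the pattern involved (the struct and function names are hypothetical, not from the driver): an expiry handler that unconditionally re-arms itself, and a teardown path that has to stop it for good.

	struct my_dev {
		struct timer_list poll_timer;
	};

	static void my_poll_expiry(struct timer_list *t)
	{
		struct my_dev *md = from_timer(md, t, poll_timer);

		/* ... poll the hardware ... */
		mod_timer(&md->poll_timer, jiffies + HZ);	/* always re-arms */
	}

	static void my_dev_detach(struct my_dev *md)
	{
		/*
		 * timer_shutdown_sync() waits for a running expiry just like
		 * timer_delete_sync(), but additionally marks the timer shut
		 * down so any later mod_timer() is ignored - acceptable here
		 * because the device is going away.
		 */
		timer_shutdown_sync(&md->poll_timer);
	}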
Fixes: 07b509e6584a5 ("Staging: comedi: add jr3_pci driver") Cc: stable Signed-off-by: Ian Abbott Link: https://lore.kernel.org/r/20250415123901.13483-1-abbotti@mev.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/comedi/drivers/jr3_pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/comedi/drivers/jr3_pci.c b/drivers/comedi/drivers/jr3_pci.c index cdc842b32bab..75dce1ff2419 100644 --- a/drivers/comedi/drivers/jr3_pci.c +++ b/drivers/comedi/drivers/jr3_pci.c @@ -758,7 +758,7 @@ static void jr3_pci_detach(struct comedi_device *dev) struct jr3_pci_dev_private *devpriv = dev->private; if (devpriv) - timer_delete_sync(&devpriv->timer); + timer_shutdown_sync(&devpriv->timer); comedi_pci_detach(dev); } -- cgit v1.2.3 From 86ce5c0a1dec02e21b4c864b2bc0cc5880a2c13c Mon Sep 17 00:00:00 2001 From: Alexander Usyskin Date: Tue, 8 Apr 2025 16:00:05 +0300 Subject: mei: me: add panther lake H DID Add Panther Lake H device id. Cc: stable Co-developed-by: Tomas Winkler Signed-off-by: Tomas Winkler Signed-off-by: Alexander Usyskin Link: https://lore.kernel.org/r/20250408130005.1358140-1-alexander.usyskin@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me-regs.h | 1 + drivers/misc/mei/pci-me.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index a5f88ec97df7..bc40b940ae21 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -117,6 +117,7 @@ #define MEI_DEV_ID_LNL_M 0xA870 /* Lunar Lake Point M */ +#define MEI_DEV_ID_PTL_H 0xE370 /* Panther Lake H */ #define MEI_DEV_ID_PTL_P 0xE470 /* Panther Lake P */ /* diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index d6ff9d82ae94..3f9c60b579ae 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -124,6 +124,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_LNL_M, MEI_ME_PCH15_CFG)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_PTL_H, MEI_ME_PCH15_CFG)}, {MEI_PCI_DEVICE(MEI_DEV_ID_PTL_P, MEI_ME_PCH15_CFG)}, /* required last entry */ -- cgit v1.2.3 From c876be906ce7e518d9ef9926478669c151999e69 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Mon, 17 Mar 2025 10:59:55 -0300 Subject: char: misc: register chrdev region with all possible minors register_chrdev will only register the first 256 minors of a major chrdev. That means that dynamically allocated misc devices with minor above 255 will fail to open with -ENXIO. This was found by kernel test robot when testing a different change that makes all dynamically allocated minors be above 255. This has, however, been separately tested by creating 256 serio_raw devices with the help of userio driver. Ever since allowing misc devices with minors above 128, this has been possible. Fix it by registering all minor numbers from 0 to MINORMASK + 1 for MISC_MAJOR. 
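For context, register_chrdev() is a convenience wrapper that only ever registers the first 256 minors; its definition in include/linux/fs.h is essentially:

	static inline int register_chrdev(unsigned int major, const char *name,
					  const struct file_operations *fops)
	{
		return __register_chrdev(major, 0, 256, name, fops);
	}

so a misc device whose dynamically allocated minor lands above 255 has no backing cdev and open() fails with -ENXIO, which is why the fix switches to __register_chrdev() with an explicit MINORMASK + 1 count.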
Reported-by: kernel test robot Cc: stable Closes: https://lore.kernel.org/oe-lkp/202503171507.6c8093d0-lkp@intel.com Fixes: ab760791c0cf ("char: misc: Increase the maximum number of dynamic misc devices to 1048448") Signed-off-by: Thadeu Lima de Souza Cascardo Tested-by: Hou Wenlong Link: https://lore.kernel.org/r/20250317-misc-chrdev-v1-1-6cd05da11aef@igalia.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/misc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/misc.c b/drivers/char/misc.c index f7dd455dd0dd..dda466f9181a 100644 --- a/drivers/char/misc.c +++ b/drivers/char/misc.c @@ -315,7 +315,7 @@ static int __init misc_init(void) goto fail_remove; err = -EIO; - if (register_chrdev(MISC_MAJOR, "misc", &misc_fops)) + if (__register_chrdev(MISC_MAJOR, 0, MINORMASK + 1, "misc", &misc_fops)) goto fail_printk; return 0; -- cgit v1.2.3 From 18eb77c75ed01439f96ae5c0f33461eb5134b907 Mon Sep 17 00:00:00 2001 From: Rengarajan S Date: Thu, 13 Mar 2025 22:38:55 +0530 Subject: misc: microchip: pci1xxxx: Fix Kernel panic during IRQ handler registration Resolve kernel panic while accessing IRQ handler associated with the generated IRQ. This is done by acquiring the spinlock and storing the current interrupt state before handling the interrupt request using generic_handle_irq. A previous fix patch was submitted where 'generic_handle_irq' was replaced with 'handle_nested_irq'. However, this change also causes the kernel panic where after determining which GPIO triggered the interrupt and attempting to call handle_nested_irq with the mapped IRQ number, leads to a failure in locating the registered handler. Fixes: 194f9f94a516 ("misc: microchip: pci1xxxx: Resolve kernel panic during GPIO IRQ handling") Cc: stable Signed-off-by: Rengarajan S Link: https://lore.kernel.org/r/20250313170856.20868-2-rengarajan.s@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c index 04756302b878..21255cdb24c1 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c @@ -37,6 +37,7 @@ struct pci1xxxx_gpio { struct auxiliary_device *aux_dev; void __iomem *reg_base; + raw_spinlock_t wa_lock; struct gpio_chip gpio; spinlock_t lock; int irq_base; @@ -257,6 +258,7 @@ static irqreturn_t pci1xxxx_gpio_irq_handler(int irq, void *dev_id) struct pci1xxxx_gpio *priv = dev_id; struct gpio_chip *gc = &priv->gpio; unsigned long int_status = 0; + unsigned long wa_flags; unsigned long flags; u8 pincount; int bit; @@ -280,7 +282,9 @@ static irqreturn_t pci1xxxx_gpio_irq_handler(int irq, void *dev_id) writel(BIT(bit), priv->reg_base + INTR_STATUS_OFFSET(gpiobank)); spin_unlock_irqrestore(&priv->lock, flags); irq = irq_find_mapping(gc->irq.domain, (bit + (gpiobank * 32))); - handle_nested_irq(irq); + raw_spin_lock_irqsave(&priv->wa_lock, wa_flags); + generic_handle_irq(irq); + raw_spin_unlock_irqrestore(&priv->wa_lock, wa_flags); } } spin_lock_irqsave(&priv->lock, flags); -- cgit v1.2.3 From e9d7748a7468581859d2b85b378135f9688a0aff Mon Sep 17 00:00:00 2001 From: Rengarajan S Date: Thu, 13 Mar 2025 22:38:56 +0530 Subject: misc: microchip: pci1xxxx: Fix incorrect IRQ status handling during ack Under irq_ack, pci1xxxx_assign_bit reads the current interrupt status, modifies and writes the entire value back. 
Since, the IRQ status bit gets cleared on writing back, the better approach is to directly write the bitmask to the register in order to preserve the value. Fixes: 1f4d8ae231f4 ("misc: microchip: pci1xxxx: Add gpio irq handler and irq helper functions irq_ack, irq_mask, irq_unmask and irq_set_type of irq_chip.") Cc: stable Signed-off-by: Rengarajan S Link: https://lore.kernel.org/r/20250313170856.20868-3-rengarajan.s@microchip.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c index 21255cdb24c1..98d3d123004c 100644 --- a/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c +++ b/drivers/misc/mchp_pci1xxxx/mchp_pci1xxxx_gpio.c @@ -168,7 +168,7 @@ static void pci1xxxx_gpio_irq_ack(struct irq_data *data) unsigned long flags; spin_lock_irqsave(&priv->lock, flags); - pci1xxx_assign_bit(priv->reg_base, INTR_STAT_OFFSET(gpio), (gpio % 32), true); + writel(BIT(gpio % 32), priv->reg_base + INTR_STAT_OFFSET(gpio)); spin_unlock_irqrestore(&priv->lock, flags); } -- cgit v1.2.3 From dc1771f718548f7d4b93991b174c6e7b5e1ba410 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:14 -0700 Subject: Revert "drivers: core: synchronize really_probe() and dev_uevent()" This reverts commit c0a40097f0bc81deafc15f9195d1fb54595cd6d0. Probing a device can take arbitrary long time. In the field we observed that, for example, probing a bad micro-SD cards in an external USB card reader (or maybe cards were good but cables were flaky) sometimes takes longer than 2 minutes due to multiple retries at various levels of the stack. We can not block uevent_show() method for that long because udev is reading that attribute very often and that blocks udev and interferes with booting of the system. The change that introduced locking was concerned with dev_uevent() racing with unbinding the driver. However we can handle it without locking (which will be done in subsequent patch). There was also claim that synchronization with probe() is needed to properly load USB drivers, however this is a red herring: the change adding the lock was introduced in May of last year and USB loading and probing worked properly for many years before that. Revert the harmful locking. Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-1-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/core.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index d2f9d3a59d6b..f9c1c623bca5 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2726,11 +2726,8 @@ static ssize_t uevent_show(struct device *dev, struct device_attribute *attr, if (!env) return -ENOMEM; - /* Synchronize with really_probe() */ - device_lock(dev); /* let the kset specific function add its keys */ retval = kset->uevent_ops->uevent(&dev->kobj, env); - device_unlock(dev); if (retval) goto out; -- cgit v1.2.3 From 04d3e5461c1f5cf8eec964ab64948ebed826e95e Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:15 -0700 Subject: driver core: introduce device_set_driver() helper In preparation to closing a race when reading driver pointer in dev_uevent() code, instead of setting device->driver pointer directly introduce device_set_driver() helper. 
Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-2-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 6 ++++++ drivers/base/core.c | 2 +- drivers/base/dd.c | 7 +++---- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index 0042e4774b0c..eb203cf8370b 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -180,6 +180,12 @@ int driver_add_groups(const struct device_driver *drv, const struct attribute_gr void driver_remove_groups(const struct device_driver *drv, const struct attribute_group **groups); void device_driver_detach(struct device *dev); +static inline void device_set_driver(struct device *dev, const struct device_driver *drv) +{ + // FIXME - this cast should not be needed "soon" + dev->driver = (struct device_driver *)drv; +} + int devres_release_all(struct device *dev); void device_block_probing(void); void device_unblock_probing(void); diff --git a/drivers/base/core.c b/drivers/base/core.c index f9c1c623bca5..b000ee61c149 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3697,7 +3697,7 @@ done: device_pm_remove(dev); dpm_sysfs_remove(dev); DPMError: - dev->driver = NULL; + device_set_driver(dev, NULL); bus_remove_device(dev); BusError: device_remove_attrs(dev); diff --git a/drivers/base/dd.c b/drivers/base/dd.c index f0e4b4aba885..b526e0e0f52d 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -550,7 +550,7 @@ static void device_unbind_cleanup(struct device *dev) arch_teardown_dma_ops(dev); kfree(dev->dma_range_map); dev->dma_range_map = NULL; - dev->driver = NULL; + device_set_driver(dev, NULL); dev_set_drvdata(dev, NULL); if (dev->pm_domain && dev->pm_domain->dismiss) dev->pm_domain->dismiss(dev); @@ -629,8 +629,7 @@ static int really_probe(struct device *dev, const struct device_driver *drv) } re_probe: - // FIXME - this cast should not be needed "soon" - dev->driver = (struct device_driver *)drv; + device_set_driver(dev, drv); /* If using pinctrl, bind pins now before probing */ ret = pinctrl_bind_pins(dev); @@ -1014,7 +1013,7 @@ static int __device_attach(struct device *dev, bool allow_async) if (ret == 0) ret = 1; else { - dev->driver = NULL; + device_set_driver(dev, NULL); ret = 0; } } else { -- cgit v1.2.3 From 18daa52418e7e4629ed1703b64777294209d2622 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Mon, 10 Mar 2025 22:24:16 -0700 Subject: driver core: fix potential NULL pointer dereference in dev_uevent() If userspace reads "uevent" device attribute at the same time as another threads unbinds the device from its driver, change to dev->driver from a valid pointer to NULL may result in crash. Fix this by using READ_ONCE() when fetching the pointer, and take bus' drivers klist lock to make sure driver instance will not disappear while we access it. Use WRITE_ONCE() when setting the driver pointer to ensure there is no tearing. 
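A minimal sketch of the store/load pairing this relies on (struct foo and struct bar are hypothetical, and this shows only the tearing aspect; the actual patch additionally holds the bus' drivers klist lock so the driver structure cannot be freed while it is inspected):

	struct bar { const char *name; };
	struct foo { struct bar *bar; };

	static void set_bar(struct foo *f, struct bar *b)
	{
		WRITE_ONCE(f->bar, b);			/* writer: no store tearing */
	}

	static const char *peek_bar_name(struct foo *f)
	{
		struct bar *b = READ_ONCE(f->bar);	/* lockless reader: one load */

		return b ? b->name : NULL;		/* may be stale, never half-written */
	}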
Signed-off-by: Dmitry Torokhov Reviewed-by: Masami Hiramatsu (Google) Link: https://lore.kernel.org/r/20250311052417.1846985-3-dmitry.torokhov@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/base.h | 13 ++++++++++++- drivers/base/bus.c | 2 +- drivers/base/core.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 44 insertions(+), 4 deletions(-) diff --git a/drivers/base/base.h b/drivers/base/base.h index eb203cf8370b..123031a757d9 100644 --- a/drivers/base/base.h +++ b/drivers/base/base.h @@ -73,6 +73,7 @@ static inline void subsys_put(struct subsys_private *sp) kset_put(&sp->subsys); } +struct subsys_private *bus_to_subsys(const struct bus_type *bus); struct subsys_private *class_to_subsys(const struct class *class); struct driver_private { @@ -182,8 +183,18 @@ void device_driver_detach(struct device *dev); static inline void device_set_driver(struct device *dev, const struct device_driver *drv) { + /* + * Majority (all?) read accesses to dev->driver happens either + * while holding device lock or in bus/driver code that is only + * invoked when the device is bound to a driver and there is no + * concern of the pointer being changed while it is being read. + * However when reading device's uevent file we read driver pointer + * without taking device lock (so we do not block there for + * arbitrary amount of time). We use WRITE_ONCE() here to prevent + * tearing so that READ_ONCE() can safely be used in uevent code. + */ // FIXME - this cast should not be needed "soon" - dev->driver = (struct device_driver *)drv; + WRITE_ONCE(dev->driver, (struct device_driver *)drv); } int devres_release_all(struct device *dev); diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 5ea3b03af9ba..5e75e1bce551 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -57,7 +57,7 @@ static int __must_check bus_rescan_devices_helper(struct device *dev, * NULL. A call to subsys_put() must be done when finished with the pointer in * order for it to be properly freed. */ -static struct subsys_private *bus_to_subsys(const struct bus_type *bus) +struct subsys_private *bus_to_subsys(const struct bus_type *bus) { struct subsys_private *sp = NULL; struct kobject *kobj; diff --git a/drivers/base/core.c b/drivers/base/core.c index b000ee61c149..cbc0099d8ef2 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -2624,6 +2624,35 @@ static const char *dev_uevent_name(const struct kobject *kobj) return NULL; } +/* + * Try filling "DRIVER=" uevent variable for a device. Because this + * function may race with binding and unbinding the device from a driver, + * we need to be careful. Binding is generally safe, at worst we miss the + * fact that the device is already bound to a driver (but the driver + * information that is delivered through uevents is best-effort, it may + * become obsolete as soon as it is generated anyways). Unbinding is more + * risky as driver pointer is transitioning to NULL, so READ_ONCE() should + * be used to make sure we are dealing with the same pointer, and to + * ensure that driver structure is not going to disappear from under us + * we take bus' drivers klist lock. The assumption that only registered + * driver can be bound to a device, and to unregister a driver bus code + * will take the same lock. 
+ */ +static void dev_driver_uevent(const struct device *dev, struct kobj_uevent_env *env) +{ + struct subsys_private *sp = bus_to_subsys(dev->bus); + + if (sp) { + scoped_guard(spinlock, &sp->klist_drivers.k_lock) { + struct device_driver *drv = READ_ONCE(dev->driver); + if (drv) + add_uevent_var(env, "DRIVER=%s", drv->name); + } + + subsys_put(sp); + } +} + static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) { const struct device *dev = kobj_to_dev(kobj); @@ -2655,8 +2684,8 @@ static int dev_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) if (dev->type && dev->type->name) add_uevent_var(env, "DEVTYPE=%s", dev->type->name); - if (dev->driver) - add_uevent_var(env, "DRIVER=%s", dev->driver->name); + /* Add "DRIVER=%s" variable if the device is bound to a driver */ + dev_driver_uevent(dev, env); /* Add common DT information about the device */ of_device_uevent(dev, env); -- cgit v1.2.3 From 10076ae01388caffec3b90abcce05f24a9e4b15e Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Apr 2025 15:28:46 +0300 Subject: drivers/base: Extend documentation with preferred way to use auxbus Document the preferred way to use auxiliary bus. Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/206e8c249f630abd3661deb36b84b26282241040.1743510317.git.leon@kernel.org [ reworded the text a bit - gregkh ] Signed-off-by: Greg Kroah-Hartman --- drivers/base/auxiliary.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/base/auxiliary.c b/drivers/base/auxiliary.c index afa4df4c5a3f..95717d509ca9 100644 --- a/drivers/base/auxiliary.c +++ b/drivers/base/auxiliary.c @@ -156,6 +156,16 @@ * }, * .ops = my_custom_ops, * }; + * + * Please note that such custom ops approach is valid, but it is hard to implement + * it right without global locks per-device to protect from auxiliary_drv removal + * during call to that ops. In addition, this implementation lacks proper module + * dependency, which causes to load/unload races between auxiliary parent and devices + * modules. + * + * The most easiest way to provide these ops reliably without needing to + * have a lock is to EXPORT_SYMBOL*() them and rely on already existing + * modules infrastructure for validity and correct dependencies chains. */ static const struct auxiliary_device_id *auxiliary_match_id(const struct auxiliary_device_id *id, -- cgit v1.2.3 From a8e858e29955175ab7587190d449fa6a566d90e5 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Tue, 1 Apr 2025 15:28:47 +0300 Subject: drivers/base: Add myself as auxiliary bus reviewer As the one who participated in initial development of auxiliary bus and later reviewed many of existing auxiliary bus consumers, I would like to be CCed on all auxiliary bus changes. Add myself as a reviewer to do not miss new development in that area. 
Signed-off-by: Leon Romanovsky Link: https://lore.kernel.org/r/b60e74e286b1d3935de46092470f716701c924a1.1743510317.git.leon@kernel.org Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 01f86f5c5f81..29c9ea9c0a3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3867,6 +3867,7 @@ AUXILIARY BUS DRIVER M: Greg Kroah-Hartman R: Dave Ertman R: Ira Weiny +R: Leon Romanovsky S: Supported T: git git://git.kernel.org/pub/scm/linux/kernel/git/driver-core/driver-core.git F: Documentation/driver-api/auxiliary_bus.rst -- cgit v1.2.3 From 1ae5e4c0626d6954114d9725990ac9c498f3b1ab Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 8 Apr 2025 12:48:55 +0300 Subject: device property: Add a note to the fwnode.h Add a note to the fwnode.h that the header should not be used directly in the leaf drivers, they all should use the higher level APIs and the respective headers. The purpose of this note is to give guidance to driver writers to avoid repeating a common mistake. Signed-off-by: Andy Shevchenko Reviewed-by: Sakari Ailus Reviewed-by: Rafael J. Wysocki Reviewed-by: Zijun Hu Link: https://lore.kernel.org/r/20250408095229.1298005-1-andriy.shevchenko@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- include/linux/fwnode.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/include/linux/fwnode.h b/include/linux/fwnode.h index 6fa0a268d538..097be89487bf 100644 --- a/include/linux/fwnode.h +++ b/include/linux/fwnode.h @@ -2,6 +2,11 @@ /* * fwnode.h - Firmware device node object handle type definition. * + * This header file provides low-level data types and definitions for firmware + * and device property providers. The respective API header files supplied by + * them should contain all of the requisite data types and definitions for end + * users, so including it directly should not be necessary. + * * Copyright (C) 2015, Intel Corporation * Author: Rafael J. Wysocki */ -- cgit v1.2.3 From bc2c46426f2d95e58c82f394531afdd034c8706c Mon Sep 17 00:00:00 2001 From: Lizhi Xu Date: Mon, 14 Apr 2025 15:11:23 +0800 Subject: software node: Prevent link creation failure from causing kobj reference count imbalance syzbot reported a uaf in software_node_notify_remove. [1] When any of the two sysfs_create_link() in software_node_notify() fails, the swnode->kobj reference count will not increase normally, which will cause swnode to be released incorrectly due to the imbalance of kobj reference count when executing software_node_notify_remove(). Increase the reference count of kobj before creating the link to avoid uaf. 
[1] BUG: KASAN: slab-use-after-free in software_node_notify_remove+0x1bc/0x1c0 drivers/base/swnode.c:1108 Read of size 1 at addr ffff888033c08908 by task syz-executor105/5844 Freed by task 5844: software_node_notify_remove+0x159/0x1c0 drivers/base/swnode.c:1106 device_platform_notify_remove drivers/base/core.c:2387 [inline] Fixes: 9eb59204d519 ("iommufd/selftest: Add set_dev_pasid in mock iommu") Reported-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=2ff22910687ee0dfd48e Tested-by: syzbot+2ff22910687ee0dfd48e@syzkaller.appspotmail.com Signed-off-by: Lizhi Xu Reviewed-by: Sakari Ailus Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250414071123.1228331-1-lizhi.xu@windriver.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/swnode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index b1726a3515f6..5c78fa6ae772 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -1080,6 +1080,7 @@ void software_node_notify(struct device *dev) if (!swnode) return; + kobject_get(&swnode->kobj); ret = sysfs_create_link(&dev->kobj, &swnode->kobj, "software_node"); if (ret) return; @@ -1089,8 +1090,6 @@ void software_node_notify(struct device *dev) sysfs_remove_link(&dev->kobj, "software_node"); return; } - - kobject_get(&swnode->kobj); } void software_node_notify_remove(struct device *dev) -- cgit v1.2.3 From b9792abb76ae1649080b8d48092c52e24c7bbdc2 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Thu, 10 Apr 2025 22:51:10 +1000 Subject: drivers/base/memory: Avoid overhead from for_each_present_section_nr() for_each_present_section_nr() was introduced to add_boot_memory_block() by commit 61659efdb35c ("drivers/base/memory: improve add_boot_memory_block()"). It causes unnecessary overhead when the present sections are really sparse. next_present_section_nr() called by the macro to find the next present section, which is far away from the spanning sections in the specified block. Too much time consumed by next_present_section_nr() in this case, which can lead to softlockup as observed by Aditya Gupta on IBM Power10 machine. watchdog: BUG: soft lockup - CPU#248 stuck for 22s! 
[swapper/248:1] Modules linked in: CPU: 248 UID: 0 PID: 1 Comm: swapper/248 Not tainted 6.15.0-rc1-next-20250408 #1 VOLUNTARY Hardware name: 9105-22A POWER10 (raw) 0x800200 opal:v7.1-107-gfda75d121942 PowerNV NIP: c00000000209218c LR: c000000002092204 CTR: 0000000000000000 REGS: c00040000418fa30 TRAP: 0900 Not tainted (6.15.0-rc1-next-20250408) MSR: 9000000002009033 CR: 28000428 XER: 00000000 CFAR: 0000000000000000 IRQMASK: 0 GPR00: c000000002092204 c00040000418fcd0 c000000001b08100 0000000000000040 GPR04: 0000000000013e00 c000c03ffebabb00 0000000000c03fff c000400fff587f80 GPR08: 0000000000000000 00000000001196f7 0000000000000000 0000000028000428 GPR12: 0000000000000000 c000000002e80000 c00000000001007c 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR28: c000000002df7f70 0000000000013dc0 c0000000011dd898 0000000008000000 NIP [c00000000209218c] memory_dev_init+0x114/0x1e0 LR [c000000002092204] memory_dev_init+0x18c/0x1e0 Call Trace: [c00040000418fcd0] [c000000002092204] memory_dev_init+0x18c/0x1e0 (unreliable) [c00040000418fd50] [c000000002091348] driver_init+0x78/0xa4 [c00040000418fd70] [c0000000020063ac] kernel_init_freeable+0x22c/0x370 [c00040000418fde0] [c0000000000100a8] kernel_init+0x34/0x25c [c00040000418fe50] [c00000000000cd94] ret_from_kernel_user_thread+0x14/0x1c Avoid the overhead by folding for_each_present_section_nr() to the outer loop. add_boot_memory_block() is dropped after that. Fixes: 61659efdb35c ("drivers/base/memory: improve add_boot_memory_block()") Closes: https://lore.kernel.org/linux-mm/20250409180344.477916-1-adityag@linux.ibm.com Reported-by: Aditya Gupta Signed-off-by: Gavin Shan Acked-by: Oscar Salvador Tested-by: Aditya Gupta Acked-by: David Hildenbrand Link: https://lore.kernel.org/r/20250410125110.1232329-1-gshan@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/base/memory.c | 41 +++++++++++++++++------------------------ 1 file changed, 17 insertions(+), 24 deletions(-) diff --git a/drivers/base/memory.c b/drivers/base/memory.c index 8f3a41d9bfaa..19469e7f88c2 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -816,21 +816,6 @@ static int add_memory_block(unsigned long block_id, unsigned long state, return 0; } -static int __init add_boot_memory_block(unsigned long base_section_nr) -{ - unsigned long nr; - - for_each_present_section_nr(base_section_nr, nr) { - if (nr >= (base_section_nr + sections_per_block)) - break; - - return add_memory_block(memory_block_id(base_section_nr), - MEM_ONLINE, NULL, NULL); - } - - return 0; -} - static int add_hotplug_memory_block(unsigned long block_id, struct vmem_altmap *altmap, struct memory_group *group) @@ -957,7 +942,7 @@ static const struct attribute_group *memory_root_attr_groups[] = { void __init memory_dev_init(void) { int ret; - unsigned long block_sz, nr; + unsigned long block_sz, block_id, nr; /* Validate the configured memory block size */ block_sz = memory_block_size_bytes(); @@ -970,15 +955,23 @@ void __init memory_dev_init(void) panic("%s() failed to register subsystem: %d\n", __func__, ret); /* - * Create entries for memory sections that were found - * during boot and have been initialized + * Create entries for memory sections that were found during boot + * and have been initialized. 
Use @block_id to track the last + * handled block and initialize it to an invalid value (ULONG_MAX) + * to bypass the block ID matching check for the first present + * block so that it can be covered. */ - for (nr = 0; nr <= __highest_present_section_nr; - nr += sections_per_block) { - ret = add_boot_memory_block(nr); - if (ret) - panic("%s() failed to add memory block: %d\n", __func__, - ret); + block_id = ULONG_MAX; + for_each_present_section_nr(0, nr) { + if (block_id != ULONG_MAX && memory_block_id(nr) == block_id) + continue; + + block_id = memory_block_id(nr); + ret = add_memory_block(block_id, MEM_ONLINE, NULL, NULL); + if (ret) { + panic("%s() failed to add memory block: %d\n", + __func__, ret); + } } } -- cgit v1.2.3 From 7c7f1bfdb2249f854a736d9b79778c7e5a29a150 Mon Sep 17 00:00:00 2001 From: Haoxiang Li Date: Mon, 10 Mar 2025 09:46:57 +0100 Subject: mcb: fix a double free bug in chameleon_parse_gdd() In chameleon_parse_gdd(), if mcb_device_register() fails, 'mdev' would be released in mcb_device_register() via put_device(). Thus, goto 'err' label and free 'mdev' again causes a double free. Just return if mcb_device_register() fails. Fixes: 3764e82e5150 ("drivers: Introduce MEN Chameleon Bus") Cc: stable Signed-off-by: Haoxiang Li Signed-off-by: Johannes Thumshirn Link: https://lore.kernel.org/r/6201d09e2975ae5789879f79a6de4c38de9edd4a.1741596225.git.jth@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/mcb/mcb-parse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mcb/mcb-parse.c b/drivers/mcb/mcb-parse.c index 02a680c73979..bf0d7d58c8b0 100644 --- a/drivers/mcb/mcb-parse.c +++ b/drivers/mcb/mcb-parse.c @@ -96,7 +96,7 @@ static int chameleon_parse_gdd(struct mcb_bus *bus, ret = mcb_device_register(bus, mdev); if (ret < 0) - goto err; + return ret; return 0; -- cgit v1.2.3 From bcfb443557166287ba544be308ed44d788599afa Mon Sep 17 00:00:00 2001 From: Raag Jadav Date: Tue, 18 Mar 2025 17:10:38 +0530 Subject: pps: generators: tio: fix platform_set_drvdata() platform_set_drvdata() is setting a double pointer to struct pps_tio as driver_data, which will point to the local stack of probe function instead of intended data. Set driver_data correctly and fix illegal memory access by its user. BUG: unable to handle page fault for address: ffffc9000117b738 RIP: 0010:hrtimer_active+0x2b/0x60 Call Trace: ? 
hrtimer_active+0x2b/0x60 hrtimer_cancel+0x19/0x50 pps_gen_tio_remove+0x1e/0x80 [pps_gen_tio] Fixes: c89755d1111f ("pps: generators: Add PPS Generator TIO Driver") Signed-off-by: Raag Jadav Acked-by: Rodolfo Giometti Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250318114038.2058677-1-raag.jadav@intel.com Signed-off-by: Greg Kroah-Hartman --- drivers/pps/generators/pps_gen_tio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pps/generators/pps_gen_tio.c b/drivers/pps/generators/pps_gen_tio.c index 1d5ffe055463..de00a85bfafa 100644 --- a/drivers/pps/generators/pps_gen_tio.c +++ b/drivers/pps/generators/pps_gen_tio.c @@ -230,7 +230,7 @@ static int pps_gen_tio_probe(struct platform_device *pdev) hrtimer_setup(&tio->timer, hrtimer_callback, CLOCK_REALTIME, HRTIMER_MODE_ABS); spin_lock_init(&tio->lock); - platform_set_drvdata(pdev, &tio); + platform_set_drvdata(pdev, tio); return 0; } -- cgit v1.2.3 From 00f1cc14da0f06d2897b8c528df7c7dcf1b8da50 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 18 Mar 2025 15:12:02 +0100 Subject: mei: vsc: Fix fortify-panic caused by invalid counted_by() use gcc 15 honors the __counted_by(len) attribute on vsc_tp_packet.buf[] and the vsc-tp.c code is using this in a wrong way. len does not contain the available size in the buffer, it contains the actual packet length *without* the crc. So as soon as vsc_tp_xfer() tries to add the crc to buf[] the fortify-panic handler gets triggered: [ 80.842193] memcpy: detected buffer overflow: 4 byte write of buffer size 0 [ 80.842243] WARNING: CPU: 4 PID: 272 at lib/string_helpers.c:1032 __fortify_report+0x45/0x50 ... [ 80.843175] __fortify_panic+0x9/0xb [ 80.843186] vsc_tp_xfer.cold+0x67/0x67 [mei_vsc_hw] [ 80.843210] ? seqcount_lockdep_reader_access.constprop.0+0x82/0x90 [ 80.843229] ? lockdep_hardirqs_on+0x7c/0x110 [ 80.843250] mei_vsc_hw_start+0x98/0x120 [mei_vsc] [ 80.843270] mei_reset+0x11d/0x420 [mei] The easiest fix would be to just drop the counted-by but with the exception of the ack buffer in vsc_tp_xfer_helper() which only contains enough room for the packet-header, all other uses of vsc_tp_packet always use a buffer of VSC_TP_MAX_XFER_SIZE bytes for the packet. Instead of just dropping the counted-by, split the vsc_tp_packet struct definition into a header and a full-packet definition and use a fixed size buf[] in the packet definition, this way fortify-source buffer overrun checking still works when enabled. 
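As a minimal illustration of the attribute involved (struct demo_pkt is made up, not the driver's): __counted_by() tells CONFIG_FORTIFY_SOURCE to bounds-check accesses to the flexible array against the named counter, so that counter has to describe the writable capacity of the array, not a smaller payload length.

	struct demo_pkt {
		u16 len;			/* must cover everything written to buf[] */
		u8  buf[] __counted_by(len);
	};

If len holds only the payload length, appending a CRC at buf[len] is treated as an overflow, which is exactly the fortify panic quoted above.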
Fixes: 566f5ca97680 ("mei: Add transport driver for IVSC device") Cc: stable@kernel.org Signed-off-by: Hans de Goede Reviewed-by: Alexander Usyskin Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20250318141203.94342-2-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/vsc-tp.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index 7be1649b1972..fa553d4914b6 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -36,20 +36,24 @@ #define VSC_TP_XFER_TIMEOUT_BYTES 700 #define VSC_TP_PACKET_PADDING_SIZE 1 #define VSC_TP_PACKET_SIZE(pkt) \ - (sizeof(struct vsc_tp_packet) + le16_to_cpu((pkt)->len) + VSC_TP_CRC_SIZE) + (sizeof(struct vsc_tp_packet_hdr) + le16_to_cpu((pkt)->hdr.len) + VSC_TP_CRC_SIZE) #define VSC_TP_MAX_PACKET_SIZE \ - (sizeof(struct vsc_tp_packet) + VSC_TP_MAX_MSG_SIZE + VSC_TP_CRC_SIZE) + (sizeof(struct vsc_tp_packet_hdr) + VSC_TP_MAX_MSG_SIZE + VSC_TP_CRC_SIZE) #define VSC_TP_MAX_XFER_SIZE \ (VSC_TP_MAX_PACKET_SIZE + VSC_TP_XFER_TIMEOUT_BYTES) #define VSC_TP_NEXT_XFER_LEN(len, offset) \ - (len + sizeof(struct vsc_tp_packet) + VSC_TP_CRC_SIZE - offset + VSC_TP_PACKET_PADDING_SIZE) + (len + sizeof(struct vsc_tp_packet_hdr) + VSC_TP_CRC_SIZE - offset + VSC_TP_PACKET_PADDING_SIZE) -struct vsc_tp_packet { +struct vsc_tp_packet_hdr { __u8 sync; __u8 cmd; __le16 len; __le32 seq; - __u8 buf[] __counted_by(len); +}; + +struct vsc_tp_packet { + struct vsc_tp_packet_hdr hdr; + __u8 buf[VSC_TP_MAX_XFER_SIZE - sizeof(struct vsc_tp_packet_hdr)]; }; struct vsc_tp { @@ -158,12 +162,12 @@ static int vsc_tp_dev_xfer(struct vsc_tp *tp, void *obuf, void *ibuf, size_t len static int vsc_tp_xfer_helper(struct vsc_tp *tp, struct vsc_tp_packet *pkt, void *ibuf, u16 ilen) { - int ret, offset = 0, cpy_len, src_len, dst_len = sizeof(struct vsc_tp_packet); + int ret, offset = 0, cpy_len, src_len, dst_len = sizeof(struct vsc_tp_packet_hdr); int next_xfer_len = VSC_TP_PACKET_SIZE(pkt) + VSC_TP_XFER_TIMEOUT_BYTES; u8 *src, *crc_src, *rx_buf = tp->rx_buf; int count_down = VSC_TP_MAX_XFER_COUNT; u32 recv_crc = 0, crc = ~0; - struct vsc_tp_packet ack; + struct vsc_tp_packet_hdr ack; u8 *dst = (u8 *)&ack; bool synced = false; @@ -280,10 +284,10 @@ int vsc_tp_xfer(struct vsc_tp *tp, u8 cmd, const void *obuf, size_t olen, guard(mutex)(&tp->mutex); - pkt->sync = VSC_TP_PACKET_SYNC; - pkt->cmd = cmd; - pkt->len = cpu_to_le16(olen); - pkt->seq = cpu_to_le32(++tp->seq); + pkt->hdr.sync = VSC_TP_PACKET_SYNC; + pkt->hdr.cmd = cmd; + pkt->hdr.len = cpu_to_le16(olen); + pkt->hdr.seq = cpu_to_le32(++tp->seq); memcpy(pkt->buf, obuf, olen); crc = ~crc32(~0, (u8 *)pkt, sizeof(pkt) + olen); -- cgit v1.2.3 From f88c0c72ffb014e5eba676ee337c4eb3b1d6a119 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Tue, 18 Mar 2025 15:12:03 +0100 Subject: mei: vsc: Use struct vsc_tp_packet as vsc-tp tx_buf and rx_buf type vsc_tp.tx_buf and vsc_tp.rx_buf point to a struct vsc_tp_packet, use the correct type instead of "void *" and use sizeof(*ptr) when allocating memory for these buffers. 
Signed-off-by: Hans de Goede Reviewed-by: Alexander Usyskin Reviewed-by: Sakari Ailus Link: https://lore.kernel.org/r/20250318141203.94342-3-hdegoede@redhat.com Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/vsc-tp.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/misc/mei/vsc-tp.c b/drivers/misc/mei/vsc-tp.c index fa553d4914b6..da26a080916c 100644 --- a/drivers/misc/mei/vsc-tp.c +++ b/drivers/misc/mei/vsc-tp.c @@ -71,8 +71,8 @@ struct vsc_tp { u32 seq; /* command buffer */ - void *tx_buf; - void *rx_buf; + struct vsc_tp_packet *tx_buf; + struct vsc_tp_packet *rx_buf; atomic_t assert_cnt; wait_queue_head_t xfer_wait; @@ -164,7 +164,7 @@ static int vsc_tp_xfer_helper(struct vsc_tp *tp, struct vsc_tp_packet *pkt, { int ret, offset = 0, cpy_len, src_len, dst_len = sizeof(struct vsc_tp_packet_hdr); int next_xfer_len = VSC_TP_PACKET_SIZE(pkt) + VSC_TP_XFER_TIMEOUT_BYTES; - u8 *src, *crc_src, *rx_buf = tp->rx_buf; + u8 *src, *crc_src, *rx_buf = (u8 *)tp->rx_buf; int count_down = VSC_TP_MAX_XFER_COUNT; u32 recv_crc = 0, crc = ~0; struct vsc_tp_packet_hdr ack; @@ -324,7 +324,7 @@ int vsc_tp_rom_xfer(struct vsc_tp *tp, const void *obuf, void *ibuf, size_t len) guard(mutex)(&tp->mutex); /* rom xfer is big endian */ - cpu_to_be32_array(tp->tx_buf, obuf, words); + cpu_to_be32_array((u32 *)tp->tx_buf, obuf, words); ret = read_poll_timeout(gpiod_get_value_cansleep, ret, !ret, VSC_TP_ROM_XFER_POLL_DELAY_US, @@ -340,7 +340,7 @@ int vsc_tp_rom_xfer(struct vsc_tp *tp, const void *obuf, void *ibuf, size_t len) return ret; if (ibuf) - be32_to_cpu_array(ibuf, tp->rx_buf, words); + be32_to_cpu_array(ibuf, (u32 *)tp->rx_buf, words); return ret; } @@ -494,11 +494,11 @@ static int vsc_tp_probe(struct spi_device *spi) if (!tp) return -ENOMEM; - tp->tx_buf = devm_kzalloc(dev, VSC_TP_MAX_XFER_SIZE, GFP_KERNEL); + tp->tx_buf = devm_kzalloc(dev, sizeof(*tp->tx_buf), GFP_KERNEL); if (!tp->tx_buf) return -ENOMEM; - tp->rx_buf = devm_kzalloc(dev, VSC_TP_MAX_XFER_SIZE, GFP_KERNEL); + tp->rx_buf = devm_kzalloc(dev, sizeof(*tp->rx_buf), GFP_KERNEL); if (!tp->rx_buf) return -ENOMEM; -- cgit v1.2.3 From 4d239f447f96bd2cb646f89431e9db186c1ccfd4 Mon Sep 17 00:00:00 2001 From: Mahesh Rao Date: Wed, 26 Mar 2025 06:54:46 -0500 Subject: firmware: stratix10-svc: Add of_platform_default_populate() Add of_platform_default_populate() to stratix10-svc driver as the firmware/svc node was moved out of soc. This fixes the failed probing of child drivers of svc node. 
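A hedged sketch of the populate/depopulate pairing used here, with hypothetical function names (the real driver does this from its existing probe/remove paths, as the hunk below shows):

	static int my_fw_probe(struct platform_device *pdev)
	{
		struct device *dev = &pdev->dev;

		/* create platform devices for the child nodes of this node */
		return of_platform_default_populate(dev_of_node(dev), NULL, dev);
	}

	static void my_fw_remove(struct platform_device *pdev)
	{
		/* remove the children created at probe time */
		of_platform_depopulate(&pdev->dev);
	}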
Cc: stable@vger.kernel.org Fixes: 23c3ebed382a ("arm64: dts: socfpga: agilex: move firmware out of soc node") Reviewed-by: Krzysztof Kozlowski Reviewed-by: Xu Yilun Signed-off-by: Mahesh Rao Signed-off-by: Dinh Nguyen Link: https://lore.kernel.org/r/20250326115446.36123-1-dinguyen@kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/stratix10-svc.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/firmware/stratix10-svc.c b/drivers/firmware/stratix10-svc.c index 3c52cb73237a..e3f990d888d7 100644 --- a/drivers/firmware/stratix10-svc.c +++ b/drivers/firmware/stratix10-svc.c @@ -1224,22 +1224,28 @@ static int stratix10_svc_drv_probe(struct platform_device *pdev) if (!svc->intel_svc_fcs) { dev_err(dev, "failed to allocate %s device\n", INTEL_FCS); ret = -ENOMEM; - goto err_unregister_dev; + goto err_unregister_rsu_dev; } ret = platform_device_add(svc->intel_svc_fcs); if (ret) { platform_device_put(svc->intel_svc_fcs); - goto err_unregister_dev; + goto err_unregister_rsu_dev; } + ret = of_platform_default_populate(dev_of_node(dev), NULL, dev); + if (ret) + goto err_unregister_fcs_dev; + dev_set_drvdata(dev, svc); pr_info("Intel Service Layer Driver Initialized\n"); return 0; -err_unregister_dev: +err_unregister_fcs_dev: + platform_device_unregister(svc->intel_svc_fcs); +err_unregister_rsu_dev: platform_device_unregister(svc->stratix10_svc_rsu); err_free_kfifo: kfifo_free(&controller->svc_fifo); @@ -1253,6 +1259,8 @@ static void stratix10_svc_drv_remove(struct platform_device *pdev) struct stratix10_svc *svc = dev_get_drvdata(&pdev->dev); struct stratix10_svc_controller *ctrl = platform_get_drvdata(pdev); + of_platform_depopulate(ctrl->dev); + platform_device_unregister(svc->intel_svc_fcs); platform_device_unregister(svc->stratix10_svc_rsu); -- cgit v1.2.3 From 1c4494c14b4124f3a13a7f4912b84b633ff4f9ba Mon Sep 17 00:00:00 2001 From: Miguel Ojeda Date: Mon, 14 Apr 2025 19:12:41 +0200 Subject: rust: kbuild: use `pound` to support GNU Make < 4.3 GNU Make 4.3 changed the behavior of `#` inside commands in commit c6966b323811 ("[SV 20513] Un-escaped # are not comments in function invocations"): * WARNING: Backward-incompatibility! Number signs (#) appearing inside a macro reference or function invocation no longer introduce comments and should not be escaped with backslashes: thus a call such as: foo := $(shell echo '#') is legal. Previously the number sign needed to be escaped, for example: foo := $(shell echo '\#') Now this latter will resolve to "\#". If you want to write makefiles portable to both versions, assign the number sign to a variable: H := \# foo := $(shell echo '$H') This was claimed to be fixed in 3.81, but wasn't, for some reason. To detect this change search for 'nocomment' in the .FEATURES variable. Unlike other commits in the kernel about this issue, such as commit 633174a7046e ("lib/raid6/test/Makefile: Use $(pound) instead of \# for Make 4.3"), that fixed the issue for newer GNU Makes, in our case it was the opposite, i.e. we need to fix it for the older ones: someone building with e.g. 4.2.1 gets the following error: scripts/Makefile.compiler:81: *** unterminated call to function 'call': missing ')'. Stop. Thus use the existing variable to fix it. 
Reported-by: moyi geek <1441339168@qq.com> Closes: https://rust-for-linux.zulipchat.com/#narrow/channel/291565/topic/x/near/512001985 Cc: stable@vger.kernel.org Fixes: e72a076c620f ("kbuild: fix issues with rustc-option") Reviewed-by: Nicolas Schier Reviewed-by: Alice Ryhl Link: https://lore.kernel.org/r/20250414171241.2126137-1-ojeda@kernel.org Signed-off-by: Miguel Ojeda --- scripts/Makefile.compiler | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/Makefile.compiler b/scripts/Makefile.compiler index 7ed7f92a7daa..f4fcc1eaaeae 100644 --- a/scripts/Makefile.compiler +++ b/scripts/Makefile.compiler @@ -79,7 +79,7 @@ ld-option = $(call try-run, $(LD) $(KBUILD_LDFLAGS) $(1) -v,$(1),$(2),$(3)) # Usage: MY_RUSTFLAGS += $(call __rustc-option,$(RUSTC),$(MY_RUSTFLAGS),-Cinstrument-coverage,-Zinstrument-coverage) # TODO: remove RUSTC_BOOTSTRAP=1 when we raise the minimum GNU Make version to 4.4 __rustc-option = $(call try-run,\ - echo '#![allow(missing_docs)]#![feature(no_core)]#![no_core]' | RUSTC_BOOTSTRAP=1\ + echo '$(pound)![allow(missing_docs)]$(pound)![feature(no_core)]$(pound)![no_core]' | RUSTC_BOOTSTRAP=1\ $(1) --sysroot=/dev/null $(filter-out --sysroot=/dev/null --target=%,$(2)) $(3)\ --crate-type=rlib --out-dir=$(TMPOUT) --emit=obj=- - >/dev/null,$(3),$(4)) -- cgit v1.2.3 From 584e61452f75bfeac2cdd83730b4059526ec60c7 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 12 Apr 2025 09:53:41 +0900 Subject: rust: helpers: Remove volatile qualifier from io helpers Remove the `volatile` qualifier used with __iomem in helper functions in io.c. These helper functions are just wrappers around the corresponding accessors so they are unnecessary. This fixes the following UML build error with CONFIG_RUST enabled: In file included from rust/helpers/helpers.c:19: rust/helpers/io.c:12:10: error: passing 'volatile void *' to parameter of type 'void *' discards qualifiers [-Werror,-Wincompatible-pointer-types-discards-qualifiers] 12 | iounmap(addr); | ^~~~ arch/um/include/asm/io.h:19:42: note: passing argument to parameter 'addr' here 19 | static inline void iounmap(void __iomem *addr) | ^ 1 error generated. [ Arnd explains [1] that removing the qualifier is the way forward (thanks!): Rihgt, I tried this last week when it came up first, removing the 'volatile' annotations in the asm-generic/io.h header and then all the ones that caused build regressions on arm/arm64/x86 randconfig and allmodconfig builds. This patch is a little longer than my original version as I did run into a few regressions later. As far as I can tell, none of these volatile annotations have any actual effect, and most of them date back to ancient kernels where this may have been required. Leaving it out of the rust interface is clearly the right way, and it shouldn't be too hard to upstream the changes below when we need to, but I also don't see any priority to send these. If anyone wants to help out, I can send them the whole patch. I created an issue [2] in case someone wants to help. - Miguel ] Fixes: ce30d94e6855 ("rust: add `io::{Io, IoRaw}` base types") Signed-off-by: FUJITA Tomonori Cc: stable@vger.kernel.org Reviewed-by: Danilo Krummrich Link: https://lore.kernel.org/rust-for-linux/0c844b70-19c7-4b14-ba29-fc99ae0d69f0@app.fastmail.com/ [1] Link: https://github.com/Rust-for-Linux/linux/issues/1156 [2] Link: https://lore.kernel.org/r/20250412005341.157150-1-fujita.tomonori@gmail.com [ Reworded for relative paths. 
- Miguel ] Signed-off-by: Miguel Ojeda --- rust/helpers/io.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/rust/helpers/io.c b/rust/helpers/io.c index 4c2401ccd720..15ea187c5466 100644 --- a/rust/helpers/io.c +++ b/rust/helpers/io.c @@ -7,94 +7,94 @@ void __iomem *rust_helper_ioremap(phys_addr_t offset, size_t size) return ioremap(offset, size); } -void rust_helper_iounmap(volatile void __iomem *addr) +void rust_helper_iounmap(void __iomem *addr) { iounmap(addr); } -u8 rust_helper_readb(const volatile void __iomem *addr) +u8 rust_helper_readb(const void __iomem *addr) { return readb(addr); } -u16 rust_helper_readw(const volatile void __iomem *addr) +u16 rust_helper_readw(const void __iomem *addr) { return readw(addr); } -u32 rust_helper_readl(const volatile void __iomem *addr) +u32 rust_helper_readl(const void __iomem *addr) { return readl(addr); } #ifdef CONFIG_64BIT -u64 rust_helper_readq(const volatile void __iomem *addr) +u64 rust_helper_readq(const void __iomem *addr) { return readq(addr); } #endif -void rust_helper_writeb(u8 value, volatile void __iomem *addr) +void rust_helper_writeb(u8 value, void __iomem *addr) { writeb(value, addr); } -void rust_helper_writew(u16 value, volatile void __iomem *addr) +void rust_helper_writew(u16 value, void __iomem *addr) { writew(value, addr); } -void rust_helper_writel(u32 value, volatile void __iomem *addr) +void rust_helper_writel(u32 value, void __iomem *addr) { writel(value, addr); } #ifdef CONFIG_64BIT -void rust_helper_writeq(u64 value, volatile void __iomem *addr) +void rust_helper_writeq(u64 value, void __iomem *addr) { writeq(value, addr); } #endif -u8 rust_helper_readb_relaxed(const volatile void __iomem *addr) +u8 rust_helper_readb_relaxed(const void __iomem *addr) { return readb_relaxed(addr); } -u16 rust_helper_readw_relaxed(const volatile void __iomem *addr) +u16 rust_helper_readw_relaxed(const void __iomem *addr) { return readw_relaxed(addr); } -u32 rust_helper_readl_relaxed(const volatile void __iomem *addr) +u32 rust_helper_readl_relaxed(const void __iomem *addr) { return readl_relaxed(addr); } #ifdef CONFIG_64BIT -u64 rust_helper_readq_relaxed(const volatile void __iomem *addr) +u64 rust_helper_readq_relaxed(const void __iomem *addr) { return readq_relaxed(addr); } #endif -void rust_helper_writeb_relaxed(u8 value, volatile void __iomem *addr) +void rust_helper_writeb_relaxed(u8 value, void __iomem *addr) { writeb_relaxed(value, addr); } -void rust_helper_writew_relaxed(u16 value, volatile void __iomem *addr) +void rust_helper_writew_relaxed(u16 value, void __iomem *addr) { writew_relaxed(value, addr); } -void rust_helper_writel_relaxed(u32 value, volatile void __iomem *addr) +void rust_helper_writel_relaxed(u32 value, void __iomem *addr) { writel_relaxed(value, addr); } #ifdef CONFIG_64BIT -void rust_helper_writeq_relaxed(u64 value, volatile void __iomem *addr) +void rust_helper_writeq_relaxed(u64 value, void __iomem *addr) { writeq_relaxed(value, addr); } -- cgit v1.2.3 From c1b4071ec3a6a594df6c49bf8f04a60a88072525 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Sat, 12 Apr 2025 09:05:06 +0900 Subject: rust: helpers: Add dma_alloc_attrs() and dma_free_attrs() Add dma_alloc_attrs() and dma_free_attrs() helpers to fix a build error when CONFIG_HAS_DMA is not enabled. Note that when CONFIG_HAS_DMA is enabled, dma_alloc_attrs() and dma_free_attrs() are included in both bindings_generated.rs and bindings_helpers_generated.rs. 
The former takes precedence so behavior remains unchanged in that case. This fixes the following build error on UML: error[E0425]: cannot find function `dma_alloc_attrs` in crate `bindings` --> rust/kernel/dma.rs:171:23 | 171 | bindings::dma_alloc_attrs( | ^^^^^^^^^^^^^^^ help: a function with a similar name exists: `dma_alloc_pages` | ::: rust/bindings/bindings_generated.rs:44568:5 | 44568 | / pub fn dma_alloc_pages( 44569 | | dev: *mut device, 44570 | | size: usize, 44571 | | dma_handle: *mut dma_addr_t, 44572 | | dir: dma_data_direction, 44573 | | gfp: gfp_t, 44574 | | ) -> *mut page; | |___________________- similarly named function `dma_alloc_pages` defined here error[E0425]: cannot find function `dma_free_attrs` in crate `bindings` --> rust/kernel/dma.rs:293:23 | 293 | bindings::dma_free_attrs( | ^^^^^^^^^^^^^^ help: a function with a similar name exists: `dma_free_pages` | ::: rust/bindings/bindings_generated.rs:44577:5 | 44577 | / pub fn dma_free_pages( 44578 | | dev: *mut device, 44579 | | size: usize, 44580 | | page: *mut page, 44581 | | dma_handle: dma_addr_t, 44582 | | dir: dma_data_direction, 44583 | | ); | |______- similarly named function `dma_free_pages` defined here Fixes: ad2907b4e308 ("rust: add dma coherent allocator abstraction") Signed-off-by: FUJITA Tomonori Acked-by: Danilo Krummrich Link: https://lore.kernel.org/r/20250412000507.157000-1-fujita.tomonori@gmail.com [ Reworded for relative paths. - Miguel ] Signed-off-by: Miguel Ojeda --- MAINTAINERS | 1 + rust/helpers/dma.c | 16 ++++++++++++++++ rust/helpers/helpers.c | 1 + 3 files changed, 18 insertions(+) create mode 100644 rust/helpers/dma.c diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..bec614ef35d9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7020,6 +7020,7 @@ L: rust-for-linux@vger.kernel.org S: Supported W: https://rust-for-linux.com T: git https://github.com/Rust-for-Linux/linux.git alloc-next +F: rust/helpers/dma.c F: rust/kernel/dma.rs F: samples/rust/rust_dma.rs diff --git a/rust/helpers/dma.c b/rust/helpers/dma.c new file mode 100644 index 000000000000..df8b8a77355a --- /dev/null +++ b/rust/helpers/dma.c @@ -0,0 +1,16 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include + +void *rust_helper_dma_alloc_attrs(struct device *dev, size_t size, + dma_addr_t *dma_handle, gfp_t flag, + unsigned long attrs) +{ + return dma_alloc_attrs(dev, size, dma_handle, flag, attrs); +} + +void rust_helper_dma_free_attrs(struct device *dev, size_t size, void *cpu_addr, + dma_addr_t dma_handle, unsigned long attrs) +{ + dma_free_attrs(dev, size, cpu_addr, dma_handle, attrs); +} diff --git a/rust/helpers/helpers.c b/rust/helpers/helpers.c index e1c21eba9b15..1e7c84df7252 100644 --- a/rust/helpers/helpers.c +++ b/rust/helpers/helpers.c @@ -14,6 +14,7 @@ #include "cpumask.c" #include "cred.c" #include "device.c" +#include "dma.c" #include "err.c" #include "fs.c" #include "io.c" -- cgit v1.2.3 From 968e1cbb1f6293c3add9607f80b5ce3d29f57583 Mon Sep 17 00:00:00 2001 From: Adam Xue Date: Mon, 14 Apr 2025 14:14:37 -0700 Subject: USB: serial: option: add Sierra Wireless EM9291 Add Sierra Wireless EM9291. 
Interface 0: MBIM control 1: MBIM data 3: AT port 4: Diagnostic port T: Bus=01 Lev=01 Prnt=01 Port=00 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=00(>ifc ) Sub=00 Prot=00 MxPS=64 #Cfgs= 1 P: Vendor=1199 ProdID=90e3 Rev=00.06 S: Manufacturer=Sierra Wireless, Incorporated S: Product=Sierra Wireless EM9291 S: SerialNumber=xxxxxxxxxxxxxxxx C: #Ifs= 4 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim E: Ad=81(I) Atr=03(Int.) MxPS= 64 Ivl=32ms I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim E: Ad=0f(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=8e(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 3 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=ff Prot=40 Driver=(none) E: Ad=01(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=83(I) Atr=03(Int.) MxPS= 10 Ivl=32ms I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=ff Prot=30 Driver=(none) E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=84(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms Signed-off-by: Adam Xue Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/option.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 5cd26dac2069..27879cc57536 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -611,6 +611,7 @@ static void option_instat_callback(struct urb *urb); /* Sierra Wireless products */ #define SIERRA_VENDOR_ID 0x1199 #define SIERRA_PRODUCT_EM9191 0x90d3 +#define SIERRA_PRODUCT_EM9291 0x90e3 /* UNISOC (Spreadtrum) products */ #define UNISOC_VENDOR_ID 0x1782 @@ -2432,6 +2433,8 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x30) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9191, 0xff, 0, 0) }, + { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9291, 0xff, 0xff, 0x30) }, + { USB_DEVICE_AND_INTERFACE_INFO(SIERRA_VENDOR_ID, SIERRA_PRODUCT_EM9291, 0xff, 0xff, 0x40) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, TOZED_PRODUCT_LT70C, 0xff, 0, 0) }, { USB_DEVICE_AND_INTERFACE_INFO(UNISOC_VENDOR_ID, LUAT_PRODUCT_AIR720U, 0xff, 0, 0) }, { USB_DEVICE_INTERFACE_CLASS(0x1bbb, 0x0530, 0xff), /* TCL IK512 MBIM */ -- cgit v1.2.3 From b399078f882b6e5d32da18b6c696cc84b12f90d5 Mon Sep 17 00:00:00 2001 From: Michael Ehrenreich Date: Mon, 17 Mar 2025 06:17:15 +0100 Subject: USB: serial: ftdi_sio: add support for Abacus Electrics Optical Probe Abacus Electrics makes optical probes for interacting with smart meters over an optical interface. At least one version uses an FT232B chip (as detected by ftdi_sio) with a custom USB PID, which needs to be added to the list to make the device work in a plug-and-play fashion. 
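For context, such an ftdi_sio addition is just another VID/PID pair in the driver's usb_device_id table, which the USB core matches against the device descriptor when binding. A generic sketch of that shape (the real ABACUS_OPTICAL_PROBE_PID define and table entry are in the patch below):

#include <linux/module.h>
#include <linux/usb.h>

/* Illustrative table only; 0x0403 is the FTDI vendor ID, 0xf458 the Abacus
 * optical probe product ID added by this patch. */
static const struct usb_device_id example_id_table[] = {
	{ USB_DEVICE(0x0403, 0xf458) },
	{ }	/* terminating entry */
};
MODULE_DEVICE_TABLE(usb, example_id_table);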
Signed-off-by: Michael Ehrenreich Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/ftdi_sio.c | 2 ++ drivers/usb/serial/ftdi_sio_ids.h | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index 9b34e23b7091..6ac7a0a5cf07 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1093,6 +1093,8 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE_INTERFACE_NUMBER(ALTERA_VID, ALTERA_UB3_602E_PID, 1) }, { USB_DEVICE_INTERFACE_NUMBER(ALTERA_VID, ALTERA_UB3_602E_PID, 2) }, { USB_DEVICE_INTERFACE_NUMBER(ALTERA_VID, ALTERA_UB3_602E_PID, 3) }, + /* Abacus Electrics */ + { USB_DEVICE(FTDI_VID, ABACUS_OPTICAL_PROBE_PID) }, { } /* Terminating entry */ }; diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 52be47d684ea..9acb6f837327 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -442,6 +442,11 @@ #define LINX_FUTURE_1_PID 0xF44B /* Linx future device */ #define LINX_FUTURE_2_PID 0xF44C /* Linx future device */ +/* + * Abacus Electrics + */ +#define ABACUS_OPTICAL_PROBE_PID 0xf458 /* ABACUS ELECTRICS Optical Probe */ + /* * Oceanic product ids */ -- cgit v1.2.3 From 4cc01410e1c1dd075df10f750775c81d1cb6672b Mon Sep 17 00:00:00 2001 From: Craig Hesling Date: Tue, 8 Apr 2025 16:27:03 -0700 Subject: USB: serial: simple: add OWON HDS200 series oscilloscope support Add serial support for OWON HDS200 series oscilloscopes and likely many other pieces of OWON test equipment. OWON HDS200 series devices host two USB endpoints, designed to facilitate bidirectional SCPI. SCPI is a predominately ASCII text protocol for test/measurement equipment. Having a serial/tty interface for these devices lowers the barrier to entry for anyone trying to write programs to communicate with them. The following shows the USB descriptor for the OWON HDS272S running firmware V5.7.1: Bus 001 Device 068: ID 5345:1234 Owon PDS6062T Oscilloscope Negotiated speed: Full Speed (12Mbps) Device Descriptor: bLength 18 bDescriptorType 1 bcdUSB 2.00 bDeviceClass 0 [unknown] bDeviceSubClass 0 [unknown] bDeviceProtocol 0 bMaxPacketSize0 64 idVendor 0x5345 Owon idProduct 0x1234 PDS6062T Oscilloscope bcdDevice 1.00 iManufacturer 1 oscilloscope iProduct 2 oscilloscope iSerial 3 oscilloscope bNumConfigurations 1 Configuration Descriptor: bLength 9 bDescriptorType 2 wTotalLength 0x0029 bNumInterfaces 1 bConfigurationValue 1 iConfiguration 0 bmAttributes 0x80 (Bus Powered) MaxPower 100mA Interface Descriptor: bLength 9 bDescriptorType 4 bInterfaceNumber 0 bAlternateSetting 0 bNumEndpoints 2 bInterfaceClass 5 Physical Interface Device bInterfaceSubClass 0 [unknown] bInterfaceProtocol 0 iInterface 0 ** UNRECOGNIZED: 09 21 11 01 00 01 22 5f 00 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x81 EP 1 IN bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 32 Endpoint Descriptor: bLength 7 bDescriptorType 5 bEndpointAddress 0x01 EP 1 OUT bmAttributes 2 Transfer Type Bulk Synch Type None Usage Type Data wMaxPacketSize 0x0040 1x 64 bytes bInterval 32 Device Status: 0x0000 (Bus Powered) OWON appears to be using the same USB Vendor and Product ID for many of their oscilloscopes. Looking at the discussion about the USB vendor/product ID, in the link bellow, suggests that this VID/PID is shared with VDS, SDS, PDS, and now the HDS series oscilloscopes. 
Available documentation for these devices seems to indicate that all use a similar SCPI protocol, some with RS232 options. It is likely that this same simple serial setup would work correctly for them all. Link: https://usb-ids.gowdy.us/read/UD/5345/1234 Signed-off-by: Craig Hesling Cc: stable@vger.kernel.org Signed-off-by: Johan Hovold --- drivers/usb/serial/usb-serial-simple.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/serial/usb-serial-simple.c b/drivers/usb/serial/usb-serial-simple.c index 2c12449ff60c..a0afaf254d12 100644 --- a/drivers/usb/serial/usb-serial-simple.c +++ b/drivers/usb/serial/usb-serial-simple.c @@ -100,6 +100,11 @@ DEVICE(nokia, NOKIA_IDS); { USB_DEVICE(0x09d7, 0x0100) } /* NovAtel FlexPack GPS */ DEVICE_N(novatel_gps, NOVATEL_IDS, 3); +/* OWON electronic test and measurement equipment driver */ +#define OWON_IDS() \ + { USB_DEVICE(0x5345, 0x1234) } /* HDS200 oscilloscopes and others */ +DEVICE(owon, OWON_IDS); + /* Siemens USB/MPI adapter */ #define SIEMENS_IDS() \ { USB_DEVICE(0x908, 0x0004) } @@ -134,6 +139,7 @@ static struct usb_serial_driver * const serial_drivers[] = { &motorola_tetra_device, &nokia_device, &novatel_gps_device, + &owon_device, &siemens_mpi_device, &suunto_device, &vivopay_device, @@ -153,6 +159,7 @@ static const struct usb_device_id id_table[] = { MOTOROLA_TETRA_IDS(), NOKIA_IDS(), NOVATEL_IDS(), + OWON_IDS(), SIEMENS_IDS(), SUUNTO_IDS(), VIVOPAY_IDS(), -- cgit v1.2.3 From a681b7c17dd21d5aa0da391ceb27a2007ba970a4 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 15 Apr 2025 12:01:08 +0200 Subject: fs: ensure that *path_locked*() helpers leave passed path pristine The functions currently leaving dangling pointers in the passed-in path leading to hard to debug bugs in the long run. Ensure that the path is left in pristine state just like we do in e.g., path_parentat() and other helpers. 
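The mechanism behind the fix is the scope-based cleanup support from <linux/cleanup.h>: the parent path is held in a local annotated with __free(path_put), so every early return releases it and the caller-supplied struct path is only written on success, where no_free_ptr() hands the references over and stops the scoped cleanup from dropping them. A condensed sketch of that shape (lookup_parent_sketch() is a hypothetical stand-in for the filename_parentat() plus lookup steps, not a real helper):

#include <linux/cleanup.h>
#include <linux/namei.h>
#include <linux/path.h>

static struct dentry *kern_path_locked_sketch(const char *name, struct path *path)
{
	struct path parent_path __free(path_put) = {};
	struct dentry *d;

	d = lookup_parent_sketch(name, &parent_path);	/* hypothetical */
	if (IS_ERR(d))
		return d;	/* parent_path released here; *path never touched */

	/* Success: transfer both references to the caller. */
	path->dentry = no_free_ptr(parent_path.dentry);
	path->mnt = no_free_ptr(parent_path.mnt);
	return d;
}

The point of the pattern is that a caller seeing an error can never be left holding half-initialized pointers in the path it passed in.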
Link: https://lore.kernel.org/20250414-rennt-wimmeln-f186c3a780f1@brauner Signed-off-by: Christian Brauner --- fs/namei.c | 34 ++++++++++++++++++---------------- 1 file changed, 18 insertions(+), 16 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 267ac9c22dd2..84a0e0b0111c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2752,46 +2752,48 @@ static int filename_parentat(int dfd, struct filename *name, /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { + struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(dfd, name, 0, path, &last, &type); + error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); - if (unlikely(type != LAST_NORM)) { - path_put(path); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, path->dentry, 0); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); - path_put(path); + inode_unlock(parent_path.dentry->d_inode); + return d; } + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); return d; } struct dentry *kern_path_locked_negative(const char *name, struct path *path) { + struct path parent_path __free(path_put) = {}; struct filename *filename __free(putname) = getname_kernel(name); struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(AT_FDCWD, filename, 0, path, &last, &type); + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); - if (unlikely(type != LAST_NORM)) { - path_put(path); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); - } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl_raw(&last, path->dentry, 0); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0); if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); - path_put(path); + inode_unlock(parent_path.dentry->d_inode); + return d; } + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); return d; } -- cgit v1.2.3 From 2b8e6b58889c672e1ae3601d9b2b070be4dc2fbc Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Sun, 13 Apr 2025 11:11:42 +0100 Subject: cpufreq: cppc: Fix invalid return value in .get() callback Returning a negative error code in a function with an unsigned return type is a pretty bad idea. It is probably worse when the justification for the change is "our static analisys tool found it". Fixes: cf7de25878a1 ("cppc_cpufreq: Fix possible null pointer dereference") Signed-off-by: Marc Zyngier Cc: "Rafael J. 
Wysocki" Cc: Viresh Kumar Reviewed-by: Lifeng Zheng Signed-off-by: Viresh Kumar --- drivers/cpufreq/cppc_cpufreq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c index b3d74f9adcf0..cb93f00bafdb 100644 --- a/drivers/cpufreq/cppc_cpufreq.c +++ b/drivers/cpufreq/cppc_cpufreq.c @@ -747,7 +747,7 @@ static unsigned int cppc_cpufreq_get_rate(unsigned int cpu) int ret; if (!policy) - return -ENODEV; + return 0; cpu_data = policy->driver_data; -- cgit v1.2.3 From 75caec0c2aa3a7ec84348d438c74cb8a2eb4de97 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 31 Mar 2025 10:16:46 +0300 Subject: i2c: atr: Fix wrong include The fwnode.h is not supposed to be used by the drivers as it has the definitions for the core parts for different device property provider implementations. Drop it. Note, that fwnode API for drivers is provided in property.h which is included here. Fixes: a076a860acae ("media: i2c: add I2C Address Translator (ATR) support") Signed-off-by: Andy Shevchenko Acked-by: Mukesh Kumar Savaliya Reviewed-by: Luca Ceresoli Reviewed-by: Tomi Valkeinen [wsa: reworded subject] Signed-off-by: Wolfram Sang --- drivers/i2c/i2c-atr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/i2c-atr.c b/drivers/i2c/i2c-atr.c index 8fe9ddff8e96..783fb8df2ebe 100644 --- a/drivers/i2c/i2c-atr.c +++ b/drivers/i2c/i2c-atr.c @@ -8,12 +8,12 @@ * Originally based on i2c-mux.c */ -#include #include #include #include #include #include +#include #include #include -- cgit v1.2.3 From 447fab30955cf7dba7dd563f42b67c02284860c8 Mon Sep 17 00:00:00 2001 From: Christian König Date: Fri, 28 Mar 2025 18:58:17 +0100 Subject: drm/amdgpu: use a dummy owner for sysfs triggered cleaner shaders v4 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Otherwise triggering sysfs multiple times without other submissions in between only runs the shader once. v2: add some comment v3: re-add missing cast v4: squash in semicolon fix Signed-off-by: Christian König Reviewed-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher (cherry picked from commit 8b2ae7d492675e8af8902f103364bef59382b935) --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 72af5e5a894a..cf2df7790077 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1438,9 +1438,11 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring) struct amdgpu_device *adev = ring->adev; struct drm_gpu_scheduler *sched = &ring->sched; struct drm_sched_entity entity; + static atomic_t counter; struct dma_fence *f; struct amdgpu_job *job; struct amdgpu_ib *ib; + void *owner; int i, r; /* Initialize the scheduler entity */ @@ -1451,9 +1453,15 @@ static int amdgpu_gfx_run_cleaner_shader_job(struct amdgpu_ring *ring) goto err; } - r = amdgpu_job_alloc_with_ib(ring->adev, &entity, NULL, - 64, 0, - &job); + /* + * Use some unique dummy value as the owner to make sure we execute + * the cleaner shader on each submission. The value just need to change + * for each submission and is otherwise meaningless. 
+ */ + owner = (void *)(unsigned long)atomic_inc_return(&counter); + + r = amdgpu_job_alloc_with_ib(ring->adev, &entity, owner, + 64, 0, &job); if (r) goto err; -- cgit v1.2.3 From 1657793def101dac7c9d3b2250391f6a3dd934ba Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Tue, 8 Apr 2025 13:09:57 -0500 Subject: drm/amd: Forbid suspending into non-default suspend states On systems that default to 'deep' some userspace software likes to try to suspend in 'deep' first. If there is a failure for any reason (such as -ENOMEM) the failure is ignored and then it will try to use 's2idle' as a fallback. This fails, but more importantly it leads to graphical problems. Forbid this behavior and only allow suspending in the last state supported by the system. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4093 Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250408180957.4027643-1-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 2aabd44aa8a3c08da3d43264c168370f6da5e81d) --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 2c04ae133848..ef6e78224fdf 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1123,6 +1123,7 @@ struct amdgpu_device { bool in_s3; bool in_s4; bool in_s0ix; + suspend_state_t last_suspend_state; enum pp_mp1_state mp1_state; struct amdgpu_doorbell_index doorbell_index; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 26bf896f1444..24ee4710f807 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2548,8 +2548,20 @@ static int amdgpu_pmops_suspend(struct device *dev) adev->in_s0ix = true; else if (amdgpu_acpi_is_s3_active(adev)) adev->in_s3 = true; - if (!adev->in_s0ix && !adev->in_s3) + if (!adev->in_s0ix && !adev->in_s3) { + /* don't allow going deep first time followed by s2idle the next time */ + if (adev->last_suspend_state != PM_SUSPEND_ON && + adev->last_suspend_state != pm_suspend_target_state) { + drm_err_once(drm_dev, "Unsupported suspend state %d\n", + pm_suspend_target_state); + return -EINVAL; + } return 0; + } + + /* cache the state last used for suspend */ + adev->last_suspend_state = pm_suspend_target_state; + return amdgpu_device_suspend(drm_dev, true); } -- cgit v1.2.3 From e7afa85a0d0eba5bf2c0a446ff622ebdbc9812d6 Mon Sep 17 00:00:00 2001 From: ZhenGuo Yin Date: Tue, 8 Apr 2025 16:18:28 +0800 Subject: drm/amdgpu: fix warning of drm_mm_clean MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel doorbell BOs needs to be freed before ttm_fini. 
Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4145 Fixes: 54c30d2a8def ("drm/amdgpu: create kernel doorbell pages") Acked-by: Alex Deucher Reviewed-by: Christian König Signed-off-by: ZhenGuo Yin Signed-off-by: Alex Deucher (cherry picked from commit 39938a8ed979e398faa3791a47e282c82bcc6f04) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index b34b915203f2..7f354cd532dc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -3510,6 +3510,7 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev) amdgpu_device_mem_scratch_fini(adev); amdgpu_ib_pool_fini(adev); amdgpu_seq64_fini(adev); + amdgpu_doorbell_fini(adev); } if (adev->ip_blocks[i].version->funcs->sw_fini) { r = adev->ip_blocks[i].version->funcs->sw_fini(&adev->ip_blocks[i]); @@ -4858,7 +4859,6 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) iounmap(adev->rmmio); adev->rmmio = NULL; - amdgpu_doorbell_fini(adev); drm_dev_exit(idx); } -- cgit v1.2.3 From 2036be31741b00f030530381643a8b35a5a42b5c Mon Sep 17 00:00:00 2001 From: David Rosca Date: Mon, 7 Apr 2025 13:12:11 +0200 Subject: drm/amdgpu: Add back JPEG to video caps for carrizo and newer JPEG is not supported on Vega only. Fixes: 0a6e7b06bdbe ("drm/amdgpu: Remove JPEG from vega and carrizo video caps") Signed-off-by: David Rosca Reviewed-by: Leo Liu Signed-off-by: Alex Deucher (cherry picked from commit 0f4dfe86fe922c37bcec99dce80a15b4d5d4726d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/amdgpu/vi.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/vi.c b/drivers/gpu/drm/amd/amdgpu/vi.c index 86d8bc10d90a..9b3510e53112 100644 --- a/drivers/gpu/drm/amd/amdgpu/vi.c +++ b/drivers/gpu/drm/amd/amdgpu/vi.c @@ -239,6 +239,13 @@ static const struct amdgpu_video_codec_info cz_video_codecs_decode_array[] = .max_pixels_per_frame = 4096 * 4096, .max_level = 186, }, + { + .codec_type = AMDGPU_INFO_VIDEO_CAPS_CODEC_IDX_JPEG, + .max_width = 4096, + .max_height = 4096, + .max_pixels_per_frame = 4096 * 4096, + .max_level = 0, + }, }; static const struct amdgpu_video_codecs cz_video_codecs_decode = -- cgit v1.2.3 From cd9e6d6fdd2de60bfb4672387c17d4ee7157cf8e Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 8 Apr 2025 21:27:15 -0400 Subject: drm/amd/display/dml2: use vzalloc rather than kzalloc The structures are large and they do not require contiguous memory so use vzalloc. Fixes: 70839da63605 ("drm/amd/display: Add new DCN401 sources") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4126 Cc: Aurabindo Pillai Reviewed-by: Aurabindo Pillai Signed-off-by: Alex Deucher (cherry picked from commit 20c50a9a793300a1fc82f3ddd0e3c68f8213fbef) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_wrapper.c | 11 ++++++----- drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c | 6 ++++-- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_wrapper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_wrapper.c index 94e99e540691..5d16f36ec95c 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_wrapper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml21/dml21_wrapper.c @@ -2,6 +2,7 @@ // // Copyright 2024 Advanced Micro Devices, Inc. 
+#include #include "dml2_internal_types.h" #include "dml_top.h" @@ -13,11 +14,11 @@ static bool dml21_allocate_memory(struct dml2_context **dml_ctx) { - *dml_ctx = kzalloc(sizeof(struct dml2_context), GFP_KERNEL); + *dml_ctx = vzalloc(sizeof(struct dml2_context)); if (!(*dml_ctx)) return false; - (*dml_ctx)->v21.dml_init.dml2_instance = kzalloc(sizeof(struct dml2_instance), GFP_KERNEL); + (*dml_ctx)->v21.dml_init.dml2_instance = vzalloc(sizeof(struct dml2_instance)); if (!((*dml_ctx)->v21.dml_init.dml2_instance)) return false; @@ -27,7 +28,7 @@ static bool dml21_allocate_memory(struct dml2_context **dml_ctx) (*dml_ctx)->v21.mode_support.display_config = &(*dml_ctx)->v21.display_config; (*dml_ctx)->v21.mode_programming.display_config = (*dml_ctx)->v21.mode_support.display_config; - (*dml_ctx)->v21.mode_programming.programming = kzalloc(sizeof(struct dml2_display_cfg_programming), GFP_KERNEL); + (*dml_ctx)->v21.mode_programming.programming = vzalloc(sizeof(struct dml2_display_cfg_programming)); if (!((*dml_ctx)->v21.mode_programming.programming)) return false; @@ -115,8 +116,8 @@ bool dml21_create(const struct dc *in_dc, struct dml2_context **dml_ctx, const s void dml21_destroy(struct dml2_context *dml2) { - kfree(dml2->v21.dml_init.dml2_instance); - kfree(dml2->v21.mode_programming.programming); + vfree(dml2->v21.dml_init.dml2_instance); + vfree(dml2->v21.mode_programming.programming); } static void dml21_calculate_rq_and_dlg_params(const struct dc *dc, struct dc_state *context, struct resource_context *out_new_hw_state, diff --git a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c index f549a778f6f1..e89571874185 100644 --- a/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c +++ b/drivers/gpu/drm/amd/display/dc/dml2/dml2_wrapper.c @@ -24,6 +24,8 @@ * */ +#include + #include "display_mode_core.h" #include "dml2_internal_types.h" #include "dml2_utils.h" @@ -747,7 +749,7 @@ bool dml2_validate(const struct dc *in_dc, struct dc_state *context, struct dml2 static inline struct dml2_context *dml2_allocate_memory(void) { - return (struct dml2_context *) kzalloc(sizeof(struct dml2_context), GFP_KERNEL); + return (struct dml2_context *) vzalloc(sizeof(struct dml2_context)); } static void dml2_init(const struct dc *in_dc, const struct dml2_configuration_options *config, struct dml2_context **dml2) @@ -821,7 +823,7 @@ void dml2_destroy(struct dml2_context *dml2) if (dml2->architecture == dml2_architecture_21) dml21_destroy(dml2); - kfree(dml2); + vfree(dml2); } void dml2_extract_dram_and_fclk_change_support(struct dml2_context *dml2, -- cgit v1.2.3 From c235a7132258ac30bd43d228222986022d21f5de Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Fri, 11 Apr 2025 17:40:26 +0530 Subject: drm/amdgpu: Use the right function for hdp flush There are a few prechecks made before HDP flush like a flush is not required on APU bare metal. Using hdp callback directly bypasses those checks. Use amdgpu_device_flush_hdp which takes care of prechecks. 
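The distinction the patch relies on: amdgpu_device_flush_hdp() is a wrapper that applies the prechecks before invoking the hdp callback, whereas dereferencing adev->hdp.funcs->flush_hdp() directly skips them. A hedged sketch of that wrapper shape, where hdp_flush_not_needed() is a made-up placeholder for the real precheck policy (such as skipping the flush on bare-metal APUs), not the actual amdgpu code:

/* Illustration only. */
static void flush_hdp_via_wrapper(struct amdgpu_device *adev,
				  struct amdgpu_ring *ring)
{
	if (hdp_flush_not_needed(adev))		/* hypothetical precheck */
		return;

	/* The callback the old code invoked directly. */
	adev->hdp.funcs->flush_hdp(adev, ring);
}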
Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher (cherry picked from commit 1d9bff4cf8c53d33ee2ff1b11574e5da739ce61c) --- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 8 ++++---- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 12 ++++++------ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 6 +++--- drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/psp_v11_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/psp_v13_0.c | 2 +- drivers/gpu/drm/amd/amdgpu/psp_v14_0.c | 2 +- 10 files changed, 23 insertions(+), 23 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index a63ce747863f..23e6a05359c2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -6114,7 +6114,7 @@ static int gfx_v10_0_cp_gfx_load_pfp_microcode(struct amdgpu_device *adev) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, mmCP_PFP_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_PFP_IC_BASE_CNTL, VMID, 0); @@ -6192,7 +6192,7 @@ static int gfx_v10_0_cp_gfx_load_ce_microcode(struct amdgpu_device *adev) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, mmCP_CE_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_CE_IC_BASE_CNTL, VMID, 0); @@ -6269,7 +6269,7 @@ static int gfx_v10_0_cp_gfx_load_me_microcode(struct amdgpu_device *adev) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, mmCP_ME_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_ME_IC_BASE_CNTL, VMID, 0); @@ -6644,7 +6644,7 @@ static int gfx_v10_0_cp_compute_load_microcode(struct amdgpu_device *adev) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, mmCP_CPC_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index d57db42f9536..2a5c2a1ae3c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -2428,7 +2428,7 @@ static int gfx_v11_0_config_me_cache(struct amdgpu_device *adev, uint64_t addr) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, regCP_ME_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_ME_IC_BASE_CNTL, VMID, 0); @@ -2472,7 +2472,7 @@ static int gfx_v11_0_config_pfp_cache(struct amdgpu_device *adev, uint64_t addr) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, regCP_PFP_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_PFP_IC_BASE_CNTL, VMID, 0); @@ -2517,7 +2517,7 @@ static int gfx_v11_0_config_mec_cache(struct amdgpu_device *adev, uint64_t addr) } if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); tmp = RREG32_SOC15(GC, 0, regCP_CPC_IC_BASE_CNTL); tmp = REG_SET_FIELD(tmp, CP_CPC_IC_BASE_CNTL, CACHE_POLICY, 0); @@ -3153,7 +3153,7 @@ static int gfx_v11_0_cp_gfx_load_pfp_microcode_rs64(struct amdgpu_device *adev) amdgpu_bo_unreserve(adev->gfx.pfp.pfp_fw_data_obj); if (amdgpu_emu_mode == 1) - 
adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); WREG32_SOC15(GC, 0, regCP_PFP_IC_BASE_LO, lower_32_bits(adev->gfx.pfp.pfp_fw_gpu_addr)); @@ -3371,7 +3371,7 @@ static int gfx_v11_0_cp_gfx_load_me_microcode_rs64(struct amdgpu_device *adev) amdgpu_bo_unreserve(adev->gfx.me.me_fw_data_obj); if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); WREG32_SOC15(GC, 0, regCP_ME_IC_BASE_LO, lower_32_bits(adev->gfx.me.me_fw_gpu_addr)); @@ -4541,7 +4541,7 @@ static int gfx_v11_0_gfxhub_enable(struct amdgpu_device *adev) if (r) return r; - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); value = (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) ? false : true; diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index e7b58e470292..62a257a4a3e9 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -2324,7 +2324,7 @@ static int gfx_v12_0_cp_gfx_load_pfp_microcode_rs64(struct amdgpu_device *adev) amdgpu_bo_unreserve(adev->gfx.pfp.pfp_fw_data_obj); if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); WREG32_SOC15(GC, 0, regCP_PFP_IC_BASE_LO, lower_32_bits(adev->gfx.pfp.pfp_fw_gpu_addr)); @@ -2468,7 +2468,7 @@ static int gfx_v12_0_cp_gfx_load_me_microcode_rs64(struct amdgpu_device *adev) amdgpu_bo_unreserve(adev->gfx.me.me_fw_data_obj); if (amdgpu_emu_mode == 1) - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); WREG32_SOC15(GC, 0, regCP_ME_IC_BASE_LO, lower_32_bits(adev->gfx.me.me_fw_gpu_addr)); @@ -3426,7 +3426,7 @@ static int gfx_v12_0_gfxhub_enable(struct amdgpu_device *adev) if (r) return r; - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); value = (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) ? false : true; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c index 95d894a231fc..809b3a882d0d 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c @@ -268,7 +268,7 @@ static void gmc_v10_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; /* flush hdp cache */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal @@ -969,7 +969,7 @@ static int gmc_v10_0_gart_enable(struct amdgpu_device *adev) adev->hdp.funcs->init_registers(adev); /* Flush HDP after it is initialized */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); value = (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) ? 
false : true; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c index ad099f136f84..e74e26b6a4f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v11_0.c @@ -229,7 +229,7 @@ static void gmc_v11_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, ack = hub->vm_inv_eng0_ack + hub->eng_distance * eng; /* flush hdp cache */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal @@ -899,7 +899,7 @@ static int gmc_v11_0_gart_enable(struct amdgpu_device *adev) return r; /* Flush HDP after it is initialized */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); value = (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) ? false : true; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c index 05c026d0b0d9..c6f290704d47 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v12_0.c @@ -297,7 +297,7 @@ static void gmc_v12_0_flush_gpu_tlb(struct amdgpu_device *adev, uint32_t vmid, return; /* flush hdp cache */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); /* This is necessary for SRIOV as well as for GFXOFF to function * properly under bare metal @@ -881,7 +881,7 @@ static int gmc_v12_0_gart_enable(struct amdgpu_device *adev) return r; /* Flush HDP after it is initialized */ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); value = (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) ? false : true; diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c index 783e0c3b86b4..5effe8327d29 100644 --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c @@ -2435,7 +2435,7 @@ static int gmc_v9_0_hw_init(struct amdgpu_ip_block *ip_block) adev->hdp.funcs->init_registers(adev); /* After HDP is initialized, flush HDP.*/ - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); if (amdgpu_vm_fault_stop == AMDGPU_VM_FAULT_STOP_ALWAYS) value = false; diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c index bb5dfc410a66..215543575f47 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v11_0.c @@ -533,7 +533,7 @@ static int psp_v11_0_memory_training(struct psp_context *psp, uint32_t ops) } memcpy_toio(adev->mman.aper_base_kaddr, buf, sz); - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); vfree(buf); drm_dev_exit(idx); } else { diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c index cc621064610f..afdf8ce3b4c5 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v13_0.c @@ -610,7 +610,7 @@ static int psp_v13_0_memory_training(struct psp_context *psp, uint32_t ops) } memcpy_toio(adev->mman.aper_base_kaddr, buf, sz); - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); vfree(buf); drm_dev_exit(idx); } else { diff --git a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c index 7c49c3f3c388..256288c6cd78 100644 --- a/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c +++ b/drivers/gpu/drm/amd/amdgpu/psp_v14_0.c @@ -498,7 +498,7 @@ static int psp_v14_0_memory_training(struct psp_context *psp, uint32_t ops) } 
memcpy_toio(adev->mman.aper_base_kaddr, buf, sz); - adev->hdp.funcs->flush_hdp(adev, NULL); + amdgpu_device_flush_hdp(adev, NULL); vfree(buf); drm_dev_exit(idx); } else { -- cgit v1.2.3 From c9b19ea63036fc537a69265acea1b18dabd1cbd3 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 15 Apr 2025 09:56:59 +0200 Subject: dma-mapping: avoid potential unused data compilation warning When CONFIG_NEED_DMA_MAP_STATE is not defined, dma-mapping clients might report unused data compilation warnings for dma_unmap_*() calls arguments. Redefine macros for those calls to let compiler to notice that it is okay when the provided arguments are not used. Reported-by: Andy Shevchenko Suggested-by: Jakub Kicinski Signed-off-by: Marek Szyprowski Tested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250415075659.428549-1-m.szyprowski@samsung.com --- include/linux/dma-mapping.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index b79925b1c433..85ab710ec0e7 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -629,10 +629,14 @@ static inline int dma_mmap_wc(struct device *dev, #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) -#define dma_unmap_addr(PTR, ADDR_NAME) (0) -#define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define dma_unmap_len(PTR, LEN_NAME) (0) -#define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) +#define dma_unmap_addr(PTR, ADDR_NAME) \ + ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) +#define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { typeof(PTR) __p __maybe_unused = PTR; } while (0) +#define dma_unmap_len(PTR, LEN_NAME) \ + ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) +#define dma_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { typeof(PTR) __p __maybe_unused = PTR; } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */ -- cgit v1.2.3 From 8260731ccad0451207b45844bb66eb161a209218 Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 16 Apr 2025 08:57:45 +0200 Subject: drm/gem: Internally test import_attach for imported objects MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Test struct drm_gem_object.import_attach to detect imported objects. During object clenanup, the dma_buf field might be NULL. Testing it in an object's free callback then incorrectly does a cleanup as for native objects. Happens for calls to drm_mode_destroy_dumb_ioctl() that clears the dma_buf field in drm_gem_object_exported_dma_buf_free(). 
v3: - only test for import_attach (Boris) v2: - use import_attach.dmabuf instead of dma_buf (Christian) Signed-off-by: Thomas Zimmermann Fixes: b57aa47d39e9 ("drm/gem: Test for imported GEM buffers with helper") Reported-by: Andy Yan Closes: https://lore.kernel.org/dri-devel/38d09d34.4354.196379aa560.Coremail.andyshrk@163.com/ Tested-by: Andy Yan Cc: Thomas Zimmermann Cc: Anusha Srivatsa Cc: Christian König Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: David Airlie Cc: Simona Vetter Cc: Sumit Semwal Cc: "Christian König" Cc: dri-devel@lists.freedesktop.org Cc: linux-media@vger.kernel.org Cc: linaro-mm-sig@lists.linaro.org Reviewed-by: Boris Brezillon Reviewed-by: Simona Vetter Link: https://lore.kernel.org/r/20250416065820.26076-1-tzimmermann@suse.de --- include/drm/drm_gem.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/drm/drm_gem.h b/include/drm/drm_gem.h index 2bf893eabb4b..bcd54020d6ba 100644 --- a/include/drm/drm_gem.h +++ b/include/drm/drm_gem.h @@ -585,8 +585,7 @@ static inline bool drm_gem_object_is_shared_for_memory_stats(struct drm_gem_obje */ static inline bool drm_gem_is_imported(const struct drm_gem_object *obj) { - /* The dma-buf's priv field points to the original GEM object. */ - return obj->dma_buf && (obj->dma_buf->priv != obj); + return !!obj->import_attach; } #ifdef CONFIG_LOCKDEP -- cgit v1.2.3 From 0a65bc27bd645894175c059397b4916e31955fb2 Mon Sep 17 00:00:00 2001 From: Joe Damato Date: Wed, 16 Apr 2025 18:58:25 +0000 Subject: eventpoll: Set epoll timeout if it's in the future Avoid an edge case where epoll_wait arms a timer and calls schedule() even if the timer will expire immediately. For example: if the user has specified an epoll busy poll usecs which is equal or larger than the epoll_wait/epoll_pwait2 timeout, it is unnecessary to call schedule_hrtimeout_range; the busy poll usecs have consumed the entire timeout duration so it is unnecessary to induce scheduling latency by calling schedule() (via schedule_hrtimeout_range). This can be measured using a simple bpftrace script: tracepoint:sched:sched_switch / args->prev_pid == $1 / { print(kstack()); print(ustack()); } Before this patch is applied: Testing an epoll_wait app with busy poll usecs set to 1000, and epoll_wait timeout set to 1ms using the script above shows: __traceiter_sched_switch+69 __schedule+1495 schedule+32 schedule_hrtimeout_range+159 do_epoll_wait+1424 __x64_sys_epoll_wait+97 do_syscall_64+95 entry_SYSCALL_64_after_hwframe+118 epoll_wait+82 Which is unexpected; the busy poll usecs should have consumed the entire timeout and there should be no reason to arm a timer. After this patch is applied: the same test scenario does not generate a call to schedule() in the above edge case. If the busy poll usecs are reduced (for example usecs: 100, epoll_wait timeout 1ms) the timer is armed as expected. 
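Put differently: busy polling can consume the entire timeout, leaving the absolute deadline *to already in the past, and arming an hrtimer for a past expiry still pays the full schedule() round trip. The guard the patch adds (ep_schedule_timeout() in the diff below) boils down to a sketch like:

#include <linux/ktime.h>

/* Only worth sleeping if the deadline still lies ahead; a NULL deadline
 * means "wait indefinitely". */
static int deadline_in_future(const ktime_t *to)
{
	return !to || ktime_after(*to, ktime_get());
}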
Fixes: bf3b9f6372c4 ("epoll: Add busy poll support to epoll with socket fds.") Signed-off-by: Joe Damato Link: https://lore.kernel.org/20250416185826.26375-1-jdamato@fastly.com Reviewed-by: Jan Kara Signed-off-by: Christian Brauner --- fs/eventpoll.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 100376863a44..4bc264b854c4 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1996,6 +1996,14 @@ static int ep_try_send_events(struct eventpoll *ep, return res; } +static int ep_schedule_timeout(ktime_t *to) +{ + if (to) + return ktime_after(*to, ktime_get()); + else + return 1; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2103,7 +2111,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); - if (!eavail) + if (!eavail && ep_schedule_timeout(to)) timed_out = !schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); -- cgit v1.2.3 From 76c332d119f9048c6e16b52359f401510f18b2ff Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Wed, 16 Apr 2025 10:38:05 +0200 Subject: drm/mgag200: Fix value in register MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix an off-by-one error when setting the vblanking start in . Commit d6460bd52c27 ("drm/mgag200: Add dedicated variables for blanking fields") switched the value from crtc_vdisplay to crtc_vblank_start, which DRM helpers copy from the former. The commit missed to subtract one though. Reported-by: Wakko Warner Closes: https://lore.kernel.org/dri-devel/CAMwc25rKPKooaSp85zDq2eh-9q4UPZD=RqSDBRp1fAagDnmRmA@mail.gmail.com/ Reported-by: Сергей Closes: https://lore.kernel.org/all/5b193b75-40b1-4342-a16a-ae9fc62f245a@gmail.com/ Closes: https://bbs.archlinux.org/viewtopic.php?id=303819 Signed-off-by: Thomas Zimmermann Fixes: d6460bd52c27 ("drm/mgag200: Add dedicated variables for blanking fields") Cc: Thomas Zimmermann Cc: Jocelyn Falempe Cc: Dave Airlie Cc: dri-devel@lists.freedesktop.org Cc: # v6.12+ Reviewed-by: Jocelyn Falempe Tested-by: Wakko Warner Link: https://lore.kernel.org/r/20250416083847.51764-1-tzimmermann@suse.de --- drivers/gpu/drm/mgag200/mgag200_mode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/mgag200/mgag200_mode.c b/drivers/gpu/drm/mgag200/mgag200_mode.c index fb71658c3117..6067d08aeee3 100644 --- a/drivers/gpu/drm/mgag200/mgag200_mode.c +++ b/drivers/gpu/drm/mgag200/mgag200_mode.c @@ -223,7 +223,7 @@ void mgag200_set_mode_regs(struct mga_device *mdev, const struct drm_display_mod vsyncstr = mode->crtc_vsync_start - 1; vsyncend = mode->crtc_vsync_end - 1; vtotal = mode->crtc_vtotal - 2; - vblkstr = mode->crtc_vblank_start; + vblkstr = mode->crtc_vblank_start - 1; vblkend = vtotal + 1; linecomp = vdispend; -- cgit v1.2.3 From a374f28700abd20e8a7d026f89aa26f759445918 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 17 Apr 2025 09:28:38 +0200 Subject: cpufreq: fix compile-test defaults Commit 3f66425a4fc8 ("cpufreq: Enable COMPILE_TEST on Arm drivers") enabled compile testing of most Arm CPUFreq drivers but left the existing default values unchanged so that many drivers are enabled by default whenever COMPILE_TEST is selected. 
This specifically results in the S3C64XX CPUFreq driver being enabled and initialised during boot of non-S3C64XX platforms with the following error logged: cpufreq: Unable to obtain ARMCLK: -2 Commit d4f610a9bafd ("cpufreq: Do not enable by default during compile testing") recently fixed most of the default values, but two entries were missed and two could use a more specific default condition. Fix the default values for drivers that can be compile tested and that should be enabled by default when not compile testing. Fixes: 3f66425a4fc8 ("cpufreq: Enable COMPILE_TEST on Arm drivers") Cc: Rob Herring (Arm) Signed-off-by: Johan Hovold Reviewed-by: Krzysztof Kozlowski Signed-off-by: Viresh Kumar --- drivers/cpufreq/Kconfig.arm | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/cpufreq/Kconfig.arm b/drivers/cpufreq/Kconfig.arm index d4d625ded285..0d46402e3094 100644 --- a/drivers/cpufreq/Kconfig.arm +++ b/drivers/cpufreq/Kconfig.arm @@ -76,7 +76,7 @@ config ARM_VEXPRESS_SPC_CPUFREQ config ARM_BRCMSTB_AVS_CPUFREQ tristate "Broadcom STB AVS CPUfreq driver" depends on (ARCH_BRCMSTB && !ARM_SCMI_CPUFREQ) || COMPILE_TEST - default ARCH_BRCMSTB + default y if ARCH_BRCMSTB && !ARM_SCMI_CPUFREQ help Some Broadcom STB SoCs use a co-processor running proprietary firmware ("AVS") to handle voltage and frequency scaling. This driver provides @@ -88,7 +88,7 @@ config ARM_HIGHBANK_CPUFREQ tristate "Calxeda Highbank-based" depends on ARCH_HIGHBANK || COMPILE_TEST depends on CPUFREQ_DT && REGULATOR && PL320_MBOX - default m + default m if ARCH_HIGHBANK help This adds the CPUFreq driver for Calxeda Highbank SoC based boards. @@ -133,7 +133,7 @@ config ARM_MEDIATEK_CPUFREQ config ARM_MEDIATEK_CPUFREQ_HW tristate "MediaTek CPUFreq HW driver" depends on ARCH_MEDIATEK || COMPILE_TEST - default m + default m if ARCH_MEDIATEK help Support for the CPUFreq HW driver. Some MediaTek chipsets have a HW engine to offload the steps @@ -256,7 +256,7 @@ config ARM_TEGRA194_CPUFREQ tristate "Tegra194 CPUFreq support" depends on ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC || (64BIT && COMPILE_TEST) depends on TEGRA_BPMP - default ARCH_TEGRA + default ARCH_TEGRA_194_SOC || ARCH_TEGRA_234_SOC help This adds CPU frequency driver support for Tegra194 SOCs. -- cgit v1.2.3 From 58db1c3cd0ce857e7210b0a95908900c25c28c3e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Apr 2025 15:16:55 -0700 Subject: netfs: Mark __nonstring lookup tables GCC 15's new -Wunterminated-string-initialization notices that the character lookup tables "fscache_cache_states" and "fscache_cookie_states" (which are not used as a C-String) need to be marked as "nonstring": fs/netfs/fscache_cache.c:375:67: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (6 chars into 5 available) [-Wunterminated-string-initialization] 375 | static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; | ^~~~~~~ fs/netfs/fscache_cookie.c:32:69: warning: initializer-string for array of 'char' truncates NUL terminator but destination lacks 'nonstring' attribute (11 chars into 10 available) [-Wunterminated-string-initialization] 32 | static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; | ^~~~~~~~~~~~ Annotate the arrays. 
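__nonstring, the kernel's wrapper around GCC's nonstring attribute from <linux/compiler_attributes.h>, tells the compiler the array is a plain byte lookup table rather than a NUL-terminated string, so an initializer that exactly fills it no longer trips the warning. A minimal illustration with made-up names:

#include <linux/compiler_attributes.h>

/* Five states map to five bytes; there is deliberately no room for '\0'. */
static const char example_state_chars[5] __nonstring = "-PAEW";

static inline char example_state_char(unsigned int state)
{
	return example_state_chars[state];	/* indexed, never used as a C string */
}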
Signed-off-by: Kees Cook Link: https://lore.kernel.org/20250416221654.work.028-kees@kernel.org Signed-off-by: Christian Brauner --- fs/netfs/fscache_cache.c | 2 +- fs/netfs/fscache_cookie.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..8f70f8da064b 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache) EXPORT_SYMBOL(fscache_withdraw_cache); #ifdef CONFIG_PROC_FS -static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW"; /* * Generate a list of caches in /proc/fs/fscache/caches diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index d4d4b3a8b106..3d56fc73435f 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru); static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); -static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD"; static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) -- cgit v1.2.3 From 777d0961ff95b26d5887fdae69900374364976f3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 17 Apr 2025 08:40:42 +0200 Subject: fs: move the bdex_statx call to vfs_getattr_nosec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently bdex_statx is only called from the very high-level vfs_statx_path function, and thus bypassing it for in-kernel calls to vfs_getattr or vfs_getattr_nosec. This breaks querying the block ѕize of the underlying device in the loop driver and also is a pitfall for any other new kernel caller. Move the call into the lowest level helper to ensure all callers get the right results. Fixes: 2d985f8c6b91 ("vfs: support STATX_DIOALIGN on block devices") Fixes: f4774e92aab8 ("loop: take the file system minimum dio alignment into account") Reported-by: "Darrick J. Wong" Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250417064042.712140-1-hch@lst.de Signed-off-by: Christian Brauner --- block/bdev.c | 3 +-- fs/stat.c | 32 ++++++++++++++++++-------------- include/linux/blkdev.h | 6 +++--- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 4844d1e27b6f..6a34179192c9 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1272,8 +1272,7 @@ void sync_bdevs(bool wait) /* * Handle STATX_{DIOALIGN, WRITE_ATOMIC} for block devices. 
*/ -void bdev_statx(struct path *path, struct kstat *stat, - u32 request_mask) +void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask) { struct inode *backing_inode; struct block_device *bdev; diff --git a/fs/stat.c b/fs/stat.c index f13308bfdc98..3d9222807214 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -204,12 +204,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); - if (inode->i_op->getattr) - return inode->i_op->getattr(idmap, path, stat, - request_mask, - query_flags); + if (inode->i_op->getattr) { + int ret; + + ret = inode->i_op->getattr(idmap, path, stat, request_mask, + query_flags); + if (ret) + return ret; + } else { + generic_fillattr(idmap, request_mask, inode, stat); + } + + /* + * If this is a block device inode, override the filesystem attributes + * with the block device specific parameters that need to be obtained + * from the bdev backing inode. + */ + if (S_ISBLK(stat->mode)) + bdev_statx(path, stat, request_mask); - generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -295,15 +308,6 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (path_mounted(path)) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - - /* - * If this is a block device inode, override the filesystem - * attributes with the block device specific parameters that need to be - * obtained from the bdev backing inode. - */ - if (S_ISBLK(stat->mode)) - bdev_statx(path, stat, request_mask); - return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e39c45bc0a97..678dc38442bf 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1685,7 +1685,7 @@ int sync_blockdev(struct block_device *bdev); int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend); int sync_blockdev_nowait(struct block_device *bdev); void sync_bdevs(bool wait); -void bdev_statx(struct path *, struct kstat *, u32); +void bdev_statx(const struct path *path, struct kstat *stat, u32 request_mask); void printk_all_partitions(void); int __init early_lookup_bdev(const char *pathname, dev_t *dev); #else @@ -1703,8 +1703,8 @@ static inline int sync_blockdev_nowait(struct block_device *bdev) static inline void sync_bdevs(bool wait) { } -static inline void bdev_statx(struct path *path, struct kstat *stat, - u32 request_mask) +static inline void bdev_statx(const struct path *path, struct kstat *stat, + u32 request_mask) { } static inline void printk_all_partitions(void) -- cgit v1.2.3 From 50492f942c281af4a48f8028f8409d7b8f2655d9 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Wed, 16 Apr 2025 17:47:11 +0200 Subject: landlock: Fix documentation for landlock_create_ruleset(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move and fix the flags documentation, and improve formatting. It makes more sense and it eases maintenance to document syscall flags in landlock.h, where they are defined. This is already the case for landlock_restrict_self(2)'s flags. The flags are now rendered like the syscall's parameters and description. 
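For illustration (not part of the patch), the documented probe can be exercised from userspace roughly like this, using the raw syscall since a libc wrapper may not be available; error handling is omitted:

#include <linux/landlock.h>
#include <sys/syscall.h>
#include <unistd.h>

static long landlock_abi_version(void)
{
	/* With attr == NULL and size == 0, the return value is the
	 * highest supported Landlock ABI version. */
	return syscall(SYS_landlock_create_ruleset, (void *)0, (size_t)0,
		       LANDLOCK_CREATE_RULESET_VERSION);
}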
Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20250416154716.1799902-1-mic@digikod.net Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 14 +++++++++----- security/landlock/syscalls.c | 15 +++++++-------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index d9d0cb827117..9a4b64be9869 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -53,12 +53,16 @@ struct landlock_ruleset_attr { __u64 scoped; }; -/* - * sys_landlock_create_ruleset() flags: +/** + * DOC: landlock_create_ruleset_flags + * + * **Flags** + * + * %LANDLOCK_CREATE_RULESET_VERSION + * Get the highest supported Landlock ABI version (starting at 1). * - * - %LANDLOCK_CREATE_RULESET_VERSION: Get the highest supported Landlock ABI - * version. - * - %LANDLOCK_CREATE_RULESET_ERRATA: Get a bitmask of fixed issues. + * %LANDLOCK_CREATE_RULESET_ERRATA + * Get a bitmask of fixed issues for the current Landlock ABI version. */ /* clang-format off */ #define LANDLOCK_CREATE_RULESET_VERSION (1U << 0) diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 54a9f29e6ebb..9515dc92b99f 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -169,20 +169,16 @@ const int landlock_abi_version = 7; * the new ruleset. * @size: Size of the pointed &struct landlock_ruleset_attr (needed for * backward and forward compatibility). - * @flags: Supported value: + * @flags: Supported values: + * * - %LANDLOCK_CREATE_RULESET_VERSION * - %LANDLOCK_CREATE_RULESET_ERRATA * * This system call enables to create a new Landlock ruleset, and returns the * related file descriptor on success. * - * If @flags is %LANDLOCK_CREATE_RULESET_VERSION and @attr is NULL and @size is - * 0, then the returned value is the highest supported Landlock ABI version - * (starting at 1). - * - * If @flags is %LANDLOCK_CREATE_RULESET_ERRATA and @attr is NULL and @size is - * 0, then the returned value is a bitmask of fixed issues for the current - * Landlock ABI version. + * If %LANDLOCK_CREATE_RULESET_VERSION or %LANDLOCK_CREATE_RULESET_ERRATA is + * set, then @attr must be NULL and @size must be 0. * * Possible returned errors are: * @@ -191,6 +187,9 @@ const int landlock_abi_version = 7; * - %E2BIG: @attr or @size inconsistencies; * - %EFAULT: @attr or @size inconsistencies; * - %ENOMSG: empty &landlock_ruleset_attr.handled_access_fs. + * + * .. kernel-doc:: include/uapi/linux/landlock.h + * :identifiers: landlock_create_ruleset_flags */ SYSCALL_DEFINE3(landlock_create_ruleset, const struct landlock_ruleset_attr __user *const, attr, -- cgit v1.2.3 From 25b1fc1cdc8931cf26e8d169f65ad07dfd653ca2 Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Wed, 16 Apr 2025 17:47:12 +0200 Subject: landlock: Fix documentation for landlock_restrict_self(2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix, deduplicate, and improve rendering of landlock_restrict_self(2)'s flags documentation. The flags are now rendered like the syscall's parameters and description. 
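For illustration (not part of the patch), the special case documented by this series — passing only LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF with a ruleset_fd of -1 — looks roughly like this from userspace; it assumes a kernel with Landlock ABI 7 and omits error handling:

#include <linux/landlock.h>
#include <sys/syscall.h>
#include <unistd.h>

static long mute_subdomain_logs(void)
{
	/* ruleset_fd == -1: no new domain is created; only logging of
	 * future nested domains is turned off. */
	return syscall(SYS_landlock_restrict_self, -1,
		       LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF);
}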
Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20250416154716.1799902-2-mic@digikod.net Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 61 +++++++++++++++++++++++++------------------ security/landlock/syscalls.c | 12 ++++----- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 9a4b64be9869..8b2a1dc5c70b 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -69,31 +69,42 @@ struct landlock_ruleset_attr { #define LANDLOCK_CREATE_RULESET_ERRATA (1U << 1) /* clang-format on */ -/* - * sys_landlock_restrict_self() flags: - * - * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF: Do not create any log related to the - * enforced restrictions. This should only be set by tools launching unknown - * or untrusted programs (e.g. a sandbox tool, container runtime, system - * service manager). Because programs sandboxing themselves should fix any - * denied access, they should not set this flag to be aware of potential - * issues reported by system's logs (i.e. audit). - * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON: Explicitly ask to continue - * logging denied access requests even after an :manpage:`execve(2)` call. - * This flag should only be set if all the programs than can legitimately be - * executed will not try to request a denied access (which could spam audit - * logs). - * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF: Do not create any log related - * to the enforced restrictions coming from future nested domains created by - * the caller or its descendants. This should only be set according to a - * runtime configuration (i.e. not hardcoded) by programs launching other - * unknown or untrusted programs that may create their own Landlock domains - * and spam logs. The main use case is for container runtimes to enable users - * to mute buggy sandboxed programs for a specific container image. Other use - * cases include sandboxer tools and init systems. Unlike - * %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF, - * %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF does not impact the requested - * restriction (if any) but only the future nested domains. +/** + * DOC: landlock_restrict_self_flags + * + * **Flags** + * + * %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF + * Do not create any log related to the enforced restrictions. This should + * only be set by tools launching unknown or untrusted programs (e.g. a + * sandbox tool, container runtime, system service manager). Because + * programs sandboxing themselves should fix any denied access, they should + * not set this flag to be aware of potential issues reported by system's + * logs (i.e. audit). + * + * %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON + * Explicitly ask to continue logging denied access requests even after an + * :manpage:`execve(2)` call. This flag should only be set if all the + * programs than can legitimately be executed will not try to request a + * denied access (which could spam audit logs). + * + * %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF + * Do not create any log related to the enforced restrictions coming from + * future nested domains created by the caller or its descendants. This + * should only be set according to a runtime configuration (i.e. not + * hardcoded) by programs launching other unknown or untrusted programs that + * may create their own Landlock domains and spam logs. 
The main use case + * is for container runtimes to enable users to mute buggy sandboxed + * programs for a specific container image. Other use cases include + * sandboxer tools and init systems. Unlike + * ``LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF``, + * ``LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF`` does not impact the + * requested restriction (if any) but only the future nested domains. + * + * It is allowed to only pass the + * ``LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF`` flag with a @ruleset_fd + * value of -1. + * */ /* clang-format off */ #define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF (1U << 0) diff --git a/security/landlock/syscalls.c b/security/landlock/syscalls.c index 9515dc92b99f..b9561e3417ae 100644 --- a/security/landlock/syscalls.c +++ b/security/landlock/syscalls.c @@ -451,18 +451,15 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, * @ruleset_fd: File descriptor tied to the ruleset to merge with the target. * @flags: Supported values: * - * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF - * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON - * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF + * - %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF + * - %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON + * - %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF * * This system call enables to enforce a Landlock ruleset on the current * thread. Enforcing a ruleset requires that the task has %CAP_SYS_ADMIN in its * namespace or is running with no_new_privs. This avoids scenarios where * unprivileged tasks can affect the behavior of privileged children. * - * It is allowed to only pass the %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF - * flag with a @ruleset_fd value of -1. - * * Possible returned errors are: * * - %EOPNOTSUPP: Landlock is supported by the kernel but disabled at boot time; @@ -474,6 +471,9 @@ SYSCALL_DEFINE4(landlock_add_rule, const int, ruleset_fd, * %CAP_SYS_ADMIN in its namespace. * - %E2BIG: The maximum number of stacked rulesets is reached for the current * thread. + * + * .. kernel-doc:: include/uapi/linux/landlock.h + * :identifiers: landlock_restrict_self_flags */ SYSCALL_DEFINE2(landlock_restrict_self, const int, ruleset_fd, const __u32, flags) -- cgit v1.2.3 From 47ce2af848b7301d8571f0e01a0d7c7162d51e4a Mon Sep 17 00:00:00 2001 From: Mickaël Salaün Date: Wed, 16 Apr 2025 17:47:13 +0200 Subject: landlock: Update log documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix and improve documentation related to landlock_restrict_self(2)'s flags. Update the LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF documentation according to the current semantic. Cc: Günther Noack Cc: Paul Moore Link: https://lore.kernel.org/r/20250416154716.1799902-3-mic@digikod.net Signed-off-by: Mickaël Salaün --- include/uapi/linux/landlock.h | 64 +++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 26 deletions(-) diff --git a/include/uapi/linux/landlock.h b/include/uapi/linux/landlock.h index 8b2a1dc5c70b..f030adc462ee 100644 --- a/include/uapi/linux/landlock.h +++ b/include/uapi/linux/landlock.h @@ -74,37 +74,49 @@ struct landlock_ruleset_attr { * * **Flags** * + * By default, denied accesses originating from programs that sandbox themselves + * are logged via the audit subsystem. Such events typically indicate unexpected + * behavior, such as bugs or exploitation attempts. However, to avoid excessive + * logging, access requests denied by a domain not created by the originating + * program are not logged by default. 
The rationale is that programs should know + * their own behavior, but not necessarily the behavior of other programs. This + * default configuration is suitable for most programs that sandbox themselves. + * For specific use cases, the following flags allow programs to modify this + * default logging behavior. + * + * The %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF and + * %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON flags apply to the newly created + * Landlock domain. + * * %LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF - * Do not create any log related to the enforced restrictions. This should - * only be set by tools launching unknown or untrusted programs (e.g. a - * sandbox tool, container runtime, system service manager). Because - * programs sandboxing themselves should fix any denied access, they should - * not set this flag to be aware of potential issues reported by system's - * logs (i.e. audit). + * Disables logging of denied accesses originating from the thread creating + * the Landlock domain, as well as its children, as long as they continue + * running the same executable code (i.e., without an intervening + * :manpage:`execve(2)` call). This is intended for programs that execute + * unknown code without invoking :manpage:`execve(2)`, such as script + * interpreters. Programs that only sandbox themselves should not set this + * flag, so users can be notified of unauthorized access attempts via system + * logs. * * %LANDLOCK_RESTRICT_SELF_LOG_NEW_EXEC_ON - * Explicitly ask to continue logging denied access requests even after an - * :manpage:`execve(2)` call. This flag should only be set if all the - * programs than can legitimately be executed will not try to request a - * denied access (which could spam audit logs). + * Enables logging of denied accesses after an :manpage:`execve(2)` call, + * providing visibility into unauthorized access attempts by newly executed + * programs within the created Landlock domain. This flag is recommended + * only when all potential executables in the domain are expected to comply + * with the access restrictions, as excessive audit log entries could make + * it more difficult to identify critical events. * * %LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF - * Do not create any log related to the enforced restrictions coming from - * future nested domains created by the caller or its descendants. This - * should only be set according to a runtime configuration (i.e. not - * hardcoded) by programs launching other unknown or untrusted programs that - * may create their own Landlock domains and spam logs. The main use case - * is for container runtimes to enable users to mute buggy sandboxed - * programs for a specific container image. Other use cases include - * sandboxer tools and init systems. Unlike - * ``LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF``, - * ``LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF`` does not impact the - * requested restriction (if any) but only the future nested domains. - * - * It is allowed to only pass the - * ``LANDLOCK_RESTRICT_SELF_LOG_SUBDOMAINS_OFF`` flag with a @ruleset_fd - * value of -1. - * + * Disables logging of denied accesses originating from nested Landlock + * domains created by the caller or its descendants. This flag should be set + * according to runtime configuration, not hardcoded, to avoid suppressing + * important security events. It is useful for container runtimes or + * sandboxing tools that may launch programs which themselves create + * Landlock domains and could otherwise generate excessive logs. 
Unlike + * ``LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF``, this flag only affects + * future nested domains, not the one being created. It can also be used + * with a @ruleset_fd value of -1 to mute subdomain logs without creating a + * domain. */ /* clang-format off */ #define LANDLOCK_RESTRICT_SELF_LOG_SAME_EXEC_OFF (1U << 0) -- cgit v1.2.3 From cf6ae7ed091059a8d1a70cf184f18ebfd18ab4af Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 26 Mar 2025 14:41:13 +1030 Subject: btrfs: subpage: access correct object when reading bitmap start in subpage_calc_start_bit() Inside the macro, subpage_calc_start_bit(), we need to calculate the offset to the beginning of the folio. But we're using offset_in_page(), on systems with 4K page size and 4K fs block size, this means we will always return offset 0 for a large folio, causing all kinds of errors. Fix it by using offset_in_folio() instead. Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 11dbd7be6a3b..bd252c78a261 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -204,7 +204,7 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) -- cgit v1.2.3 From bc2dbc4983afedd198490cca043798f57c93e9bf Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Sat, 29 Mar 2025 17:46:35 +1030 Subject: btrfs: avoid page_lockend underflow in btrfs_punch_hole_lock_range() [BUG] When running btrfs/004 with 4K fs block size and 64K page size, sometimes fsstress workload can take 100% CPU for a while, but not long enough to trigger a 120s hang warning. [CAUSE] When such 100% CPU usage happens, btrfs_punch_hole_lock_range() is always in the call trace. One example when this problem happens, the function btrfs_punch_hole_lock_range() got the following parameters: lock_start = 4096, lockend = 20469 Then we calculate @page_lockstart by rounding up lock_start to page boundary, which is 64K (page size is 64K). For @page_lockend, we round down the value towards page boundary, which result 0. Then since we need to pass an inclusive end to filemap_range_has_page(), we subtract 1 from the rounded down value, resulting in (u64)-1. In the above case, the range is inside the same page, and we do not even need to call filemap_range_has_page(), not to mention to call it with (u64)-1 at the end. This behavior will cause btrfs_punch_hole_lock_range() to busy loop waiting for irrelevant range to have its pages dropped. [FIX] Calculate @page_lockend by just rounding down @lockend, without decreasing the value by one. So @page_lockend will no longer overflow. Then exit early if @page_lockend is no larger than @page_lockstart. As it means either the range is inside the same page, or the two pages are adjacent already. Finally only decrease @page_lockend when calling filemap_range_has_page(). 
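Spelling out the arithmetic from the report above (illustration only), with a 64K page size:

  page_lockstart     = round_up(4096, 65536)            = 65536
  page_lockend (old) = round_down(20469 + 1, 65536) - 1 = (u64)-1
  page_lockend (new) = round_down(20469 + 1, 65536)     = 0

With the fix, page_lockend (0) is not larger than page_lockstart (65536), so the new early break is taken instead of busy-polling filemap_range_has_page() over a bogus range.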
Fixes: 0528476b6ac7 ("btrfs: fix the filemap_range_has_page() call in btrfs_punch_hole_lock_range()") Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/file.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 262a707d8990..71b8a825c447 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2104,15 +2104,20 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. */ const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE); while (1) { truncate_pagecache_range(inode, lockstart, lockend); lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + break; /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2124,7 +2129,7 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * we do, unlock the range and retry. */ if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + page_lockend - 1)) break; unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, -- cgit v1.2.3 From 7d82240c457fc15abdf7dedf15104cea774b005b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 1 Apr 2025 18:20:28 +1030 Subject: btrfs: fix the ASSERT() inside GET_SUBPAGE_BITMAP() After enabling large data folios for tests, I hit the ASSERT() inside GET_SUBPAGE_BITMAP() where blocks_per_folio matches BITS_PER_LONG. The ASSERT() itself is only based on the original subpage fs block size, where we have at most 16 blocks per page, thus "ASSERT(blocks_per_folio < BITS_PER_LONG)". However the experimental large data folio support will set the max folio order according to the BITS_PER_LONG, so we can have a case where a large folio contains exactly BITS_PER_LONG blocks. So the ASSERT() is too strict, change it to "ASSERT(blocks_per_folio <= BITS_PER_LONG)" to avoid the false alert. 
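As a quick worked example of the boundary case (not from the patch): on a 64-bit kernel with a 4K block size, a 256K large folio holds 256K / 4K = 64 blocks, which is exactly BITS_PER_LONG. bitmap_read() still handles a 64-bit read in one call, so "<=" is the correct bound and the strict "<" assertion was a false alert.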
Reviewed-by: Filipe Manana Reviewed-by: Sweet Tea Dorminy Reviewed-by: Boris Burkov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/subpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index bd252c78a261..c0a0b8b063d0 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -666,7 +666,7 @@ IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, btrfs_blocks_per_folio(fs_info, folio); \ const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(blocks_per_folio < BITS_PER_LONG); \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ blocks_per_folio * btrfs_bitmap_nr_##name, \ blocks_per_folio); \ -- cgit v1.2.3 From b0c26f47992672661340dd6ea931240213016609 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Mon, 17 Mar 2025 16:04:01 +0100 Subject: btrfs: zoned: return EIO on RAID1 block group write pointer mismatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There was a bug report about a NULL pointer dereference in __btrfs_add_free_space_zoned() that ultimately happens because a conversion from the default metadata profile DUP to a RAID1 profile on two disks. The stack trace has the following signature: BTRFS error (device sdc): zoned: write pointer offset mismatch of zones in raid1 profile BUG: kernel NULL pointer dereference, address: 0000000000000058 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 0 P4D 0 Oops: Oops: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:__btrfs_add_free_space_zoned.isra.0+0x61/0x1a0 RSP: 0018:ffffa236b6f3f6d0 EFLAGS: 00010246 RAX: 0000000000000000 RBX: ffff96c8132f3400 RCX: 0000000000000001 RDX: 0000000010000000 RSI: 0000000000000000 RDI: ffff96c8132f3410 RBP: 0000000010000000 R08: 0000000000000003 R09: 0000000000000000 R10: 0000000000000000 R11: 00000000ffffffff R12: 0000000000000000 R13: ffff96c758f65a40 R14: 0000000000000001 R15: 000011aac0000000 FS: 00007fdab1cb2900(0000) GS:ffff96e60ca00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000058 CR3: 00000001a05ae000 CR4: 0000000000350ef0 Call Trace: ? __die_body.cold+0x19/0x27 ? page_fault_oops+0x15c/0x2f0 ? exc_page_fault+0x7e/0x180 ? asm_exc_page_fault+0x26/0x30 ? __btrfs_add_free_space_zoned.isra.0+0x61/0x1a0 btrfs_add_free_space_async_trimmed+0x34/0x40 btrfs_add_new_free_space+0x107/0x120 btrfs_make_block_group+0x104/0x2b0 btrfs_create_chunk+0x977/0xf20 btrfs_chunk_alloc+0x174/0x510 ? srso_return_thunk+0x5/0x5f btrfs_inc_block_group_ro+0x1b1/0x230 btrfs_relocate_block_group+0x9e/0x410 btrfs_relocate_chunk+0x3f/0x130 btrfs_balance+0x8ac/0x12b0 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? __kmalloc_cache_noprof+0x14c/0x3e0 btrfs_ioctl+0x2686/0x2a80 ? srso_return_thunk+0x5/0x5f ? ioctl_has_perm.constprop.0.isra.0+0xd2/0x120 __x64_sys_ioctl+0x97/0xc0 do_syscall_64+0x82/0x160 ? srso_return_thunk+0x5/0x5f ? __memcg_slab_free_hook+0x11a/0x170 ? srso_return_thunk+0x5/0x5f ? kmem_cache_free+0x3f0/0x450 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? syscall_exit_to_user_mode+0x10/0x210 ? srso_return_thunk+0x5/0x5f ? do_syscall_64+0x8e/0x160 ? sysfs_emit+0xaf/0xc0 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? seq_read_iter+0x207/0x460 ? srso_return_thunk+0x5/0x5f ? vfs_read+0x29c/0x370 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? 
syscall_exit_to_user_mode+0x10/0x210 ? srso_return_thunk+0x5/0x5f ? do_syscall_64+0x8e/0x160 ? srso_return_thunk+0x5/0x5f ? exc_page_fault+0x7e/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7fdab1e0ca6d RSP: 002b:00007ffeb2b60c80 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007fdab1e0ca6d RDX: 00007ffeb2b60d80 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 00007ffeb2b60cd0 R08: 0000000000000000 R09: 0000000000000013 R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 R13: 00007ffeb2b6343b R14: 00007ffeb2b60d80 R15: 0000000000000001 CR2: 0000000000000058 ---[ end trace 0000000000000000 ]--- The 1st line is the most interesting here: BTRFS error (device sdc): zoned: write pointer offset mismatch of zones in raid1 profile When a RAID1 block-group is created and a write pointer mismatch between the disks in the RAID set is detected, btrfs sets the alloc_offset to the length of the block group marking it as full. Afterwards the code expects that a balance operation will evacuate the data in this block-group and repair the problems. But before this is possible, the new space of this block-group will be accounted in the free space cache. But in __btrfs_add_free_space_zoned() it is being checked if it is a initial creation of a block group and if not a reclaim decision will be made. But the decision if a block-group's free space accounting is done for an initial creation depends on if the size of the added free space is the whole length of the block-group and the allocation offset is 0. But as btrfs_load_block_group_zone_info() sets the allocation offset to the zone capacity (i.e. marking the block-group as full) this initial decision is not met, and the space_info pointer in the 'struct btrfs_block_group' has not yet been assigned. Fail creation of the block group and rely on manual user intervention to re-balance the filesystem. Afterwards the filesystem can be unmounted, mounted in degraded mode and the missing device can be removed after a full balance of the filesystem. Reported-by: 西木野羰基 Link: https://lore.kernel.org/linux-btrfs/CAB_b4sBhDe3tscz=duVyhc9hNE+gu=B8CrgLO152uMyanR8BEA@mail.gmail.com/ Fixes: b1934cd60695 ("btrfs: zoned: handle broken write pointer on zones") Reviewed-by: Anand Jain Signed-off-by: Johannes Thumshirn Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index fb8b8b29c169..7c502192cd6b 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1659,7 +1659,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: -- cgit v1.2.3 From 50fecb8cf069f0814642ce0bde965bdc1f35a79e Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 5 Apr 2025 18:40:14 +0100 Subject: btrfs: fix invalid inode pointer after failure to create reloc inode If we have a failure at create_reloc_inode(), under the 'out' label we assign an error pointer to the 'inode' variable and then return a weird pointer because we return the expression "&inode->vfs_inode": static noinline_for_stack struct inode *create_reloc_inode( const struct btrfs_block_group *group) { (...) out: (...) 
if (ret) { if (inode) iput(&inode->vfs_inode); inode = ERR_PTR(ret); } return &inode->vfs_inode; } This can make us return a pointer that is not an error pointer and make the caller proceed as if an error didn't happen and later result in an invalid memory access when dereferencing the inode pointer. Syzbot reported reported such a case with the following stack trace: R10: 0000000000000002 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 431bde82d7b634db R15: 00007ffc55de5790 BTRFS info (device loop0): relocating block group 6881280 flags data|metadata Oops: general protection fault, probably for non-canonical address 0xdffffc0000000045: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000228-0x000000000000022f] CPU: 0 UID: 0 PID: 5332 Comm: syz-executor215 Not tainted 6.14.0-syzkaller-13423-ga8662bcd2ff1 #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 RIP: 0010:relocate_file_extent_cluster+0xe7/0x1750 fs/btrfs/relocation.c:2971 Code: 00 74 08 (...) RSP: 0018:ffffc9000d3375e0 EFLAGS: 00010203 RAX: 0000000000000045 RBX: 000000000000022c RCX: ffff888000562440 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880452db000 RBP: ffffc9000d337870 R08: ffffffff84089251 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff9368a020 R14: 0000000000000394 R15: ffff8880452db000 FS: 000055558bc7b380(0000) GS:ffff88808c596000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a7a192e740 CR3: 0000000036e2e000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: relocate_block_group+0xa1e/0xd50 fs/btrfs/relocation.c:3657 btrfs_relocate_block_group+0x777/0xd80 fs/btrfs/relocation.c:4011 btrfs_relocate_chunk+0x12c/0x3b0 fs/btrfs/volumes.c:3511 __btrfs_balance+0x1a93/0x25e0 fs/btrfs/volumes.c:4292 btrfs_balance+0xbde/0x10c0 fs/btrfs/volumes.c:4669 btrfs_ioctl_balance+0x3f5/0x660 fs/btrfs/ioctl.c:3586 vfs_ioctl fs/ioctl.c:51 [inline] __do_sys_ioctl fs/ioctl.c:906 [inline] __se_sys_ioctl+0xf1/0x160 fs/ioctl.c:892 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf3/0x230 arch/x86/entry/syscall_64.c:94 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7fb4ef537dd9 Code: 28 00 00 (...) RSP: 002b:00007ffc55de5728 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 00007ffc55de5750 RCX: 00007fb4ef537dd9 RDX: 0000200000000440 RSI: 00000000c4009420 RDI: 0000000000000003 RBP: 0000000000000002 R08: 00007ffc55de54c6 R09: 00007ffc55de5770 R10: 0000000000000002 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 431bde82d7b634db R15: 00007ffc55de5790 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:relocate_file_extent_cluster+0xe7/0x1750 fs/btrfs/relocation.c:2971 Code: 00 74 08 (...) 
RSP: 0018:ffffc9000d3375e0 EFLAGS: 00010203 RAX: 0000000000000045 RBX: 000000000000022c RCX: ffff888000562440 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff8880452db000 RBP: ffffc9000d337870 R08: ffffffff84089251 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: dffffc0000000000 R13: ffffffff9368a020 R14: 0000000000000394 R15: ffff8880452db000 FS: 000055558bc7b380(0000) GS:ffff88808c596000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055a7a192e740 CR3: 0000000036e2e000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 ---------------- Code disassembly (best guess): 0: 00 74 08 48 add %dh,0x48(%rax,%rcx,1) 4: 89 df mov %ebx,%edi 6: e8 f8 36 24 fe call 0xfe243703 b: 48 89 9c 24 30 01 00 mov %rbx,0x130(%rsp) 12: 00 13: 4c 89 74 24 28 mov %r14,0x28(%rsp) 18: 4d 8b 76 10 mov 0x10(%r14),%r14 1c: 49 8d 9e 98 fe ff ff lea -0x168(%r14),%rbx 23: 48 89 d8 mov %rbx,%rax 26: 48 c1 e8 03 shr $0x3,%rax * 2a: 42 80 3c 20 00 cmpb $0x0,(%rax,%r12,1) <-- trapping instruction 2f: 74 08 je 0x39 31: 48 89 df mov %rbx,%rdi 34: e8 ca 36 24 fe call 0xfe243703 39: 4c 8b 3b mov (%rbx),%r15 3c: 48 rex.W 3d: 8b .byte 0x8b 3e: 44 rex.R 3f: 24 .byte 0x24 So fix this by returning the error immediately. Reported-by: syzbot+7481815bb47ef3e702e2@syzkaller.appspotmail.com Link: https://lore.kernel.org/linux-btrfs/67f14ee9.050a0220.0a13.023e.GAE@google.com/ Fixes: b204e5c7d4dc ("btrfs: make btrfs_iget() return a btrfs inode instead") Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f948f4f6431c..e17bcb034595 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3803,7 +3803,7 @@ out: if (ret) { if (inode) iput(&inode->vfs_inode); - inode = ERR_PTR(ret); + return ERR_PTR(ret); } return &inode->vfs_inode; } -- cgit v1.2.3 From f1ab0171e9be96fd530329fa54761cff5e09ea95 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Fri, 4 Apr 2025 20:19:41 +0200 Subject: btrfs: tree-checker: adjust error code for header level check The whole tree checker returns EUCLEAN, except the one check in btrfs_verify_level_key(). This was inherited from the function that was moved from disk-io.c in 2cac5af16537 ("btrfs: move btrfs_verify_level_key into tree-checker.c") but this should be unified with the rest. Reviewed-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..2b66a6130269 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -2235,7 +2235,7 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) -- cgit v1.2.3 From c1a79b1a583654f24b17da81ba868b0064077243 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 19 Mar 2025 10:49:16 +0900 Subject: block: introduce zone capacity helper {bdev,disk}_zone_capacity() takes block_device or gendisk and sector position and returns the zone capacity of the corresponding zone. With that, move disk_nr_zones() and blk_zone_plug_bio() to consolidate them in the same #ifdef block. 
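As a hedged usage sketch (the wrapper below is illustrative; only bdev_zone_capacity() comes from the patch), the helper reports the capacity in sectors of the zone containing a given sector, which a caller can convert to bytes:

static inline u64 zone_capacity_bytes(struct block_device *bdev,
				      sector_t sector)
{
	return (u64)bdev_zone_capacity(bdev, sector) << SECTOR_SHIFT;
}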
Signed-off-by: Naohiro Aota Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Reviewed-by: Chaitanya Kulkarni Signed-off-by: David Sterba --- include/linux/blkdev.h | 67 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 22 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d37751789bf5..c57babb0adb9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -691,23 +691,6 @@ static inline bool blk_queue_is_zoned(struct request_queue *q) (q->limits.features & BLK_FEAT_ZONED); } -#ifdef CONFIG_BLK_DEV_ZONED -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return disk->nr_zones; -} -bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); -#else /* CONFIG_BLK_DEV_ZONED */ -static inline unsigned int disk_nr_zones(struct gendisk *disk) -{ - return 0; -} -static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) -{ - return false; -} -#endif /* CONFIG_BLK_DEV_ZONED */ - static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) { if (!blk_queue_is_zoned(disk->queue)) @@ -715,11 +698,6 @@ static inline unsigned int disk_zone_no(struct gendisk *disk, sector_t sector) return sector >> ilog2(disk->queue->limits.chunk_sectors); } -static inline unsigned int bdev_nr_zones(struct block_device *bdev) -{ - return disk_nr_zones(bdev->bd_disk); -} - static inline unsigned int bdev_max_open_zones(struct block_device *bdev) { return bdev->bd_disk->queue->limits.max_open_zones; @@ -826,6 +804,51 @@ static inline u64 sb_bdev_nr_blocks(struct super_block *sb) (sb->s_blocksize_bits - SECTOR_SHIFT); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return disk->nr_zones; +} +bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs); + +/** + * disk_zone_capacity - returns the zone capacity of zone containing @sector + * @disk: disk to work with + * @sector: sector number within the querying zone + * + * Returns the zone capacity of a zone containing @sector. @sector can be any + * sector in the zone. + */ +static inline unsigned int disk_zone_capacity(struct gendisk *disk, + sector_t sector) +{ + sector_t zone_sectors = disk->queue->limits.chunk_sectors; + + if (sector + zone_sectors >= get_capacity(disk)) + return disk->last_zone_capacity; + return disk->zone_capacity; +} +static inline unsigned int bdev_zone_capacity(struct block_device *bdev, + sector_t pos) +{ + return disk_zone_capacity(bdev->bd_disk, pos); +} +#else /* CONFIG_BLK_DEV_ZONED */ +static inline unsigned int disk_nr_zones(struct gendisk *disk) +{ + return 0; +} +static inline bool blk_zone_plug_bio(struct bio *bio, unsigned int nr_segs) +{ + return false; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +static inline unsigned int bdev_nr_zones(struct block_device *bdev) +{ + return disk_nr_zones(bdev->bd_disk); +} + int bdev_disk_changed(struct gendisk *disk, bool invalidate); void put_disk(struct gendisk *disk); -- cgit v1.2.3 From 866bafae59ecffcf1840d846cd79740be29f21d6 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Wed, 19 Mar 2025 10:49:17 +0900 Subject: btrfs: zoned: skip reporting zone for new block group There is a potential deadlock if we do report zones in an IO context, detailed in below lockdep report. When one process do a report zones and another process freezes the block device, the report zones side cannot allocate a tag because the freeze is already started. 
This can thus result in new block group creation to hang forever, blocking the write path. Thankfully, a new block group should be created on empty zones. So, reporting the zones is not necessary and we can set the write pointer = 0 and load the zone capacity from the block layer using bdev_zone_capacity() helper. ====================================================== WARNING: possible circular locking dependency detected 6.14.0-rc1 #252 Not tainted ------------------------------------------------------ modprobe/1110 is trying to acquire lock: ffff888100ac83e0 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}, at: __flush_work+0x38f/0xb60 but task is already holding lock: ffff8881205b6f20 (&q->q_usage_counter(queue)#16){++++}-{0:0}, at: sd_remove+0x85/0x130 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (&q->q_usage_counter(queue)#16){++++}-{0:0}: blk_queue_enter+0x3d9/0x500 blk_mq_alloc_request+0x47d/0x8e0 scsi_execute_cmd+0x14f/0xb80 sd_zbc_do_report_zones+0x1c1/0x470 sd_zbc_report_zones+0x362/0xd60 blkdev_report_zones+0x1b1/0x2e0 btrfs_get_dev_zones+0x215/0x7e0 [btrfs] btrfs_load_block_group_zone_info+0x6d2/0x2c10 [btrfs] btrfs_make_block_group+0x36b/0x870 [btrfs] btrfs_create_chunk+0x147d/0x2320 [btrfs] btrfs_chunk_alloc+0x2ce/0xcf0 [btrfs] start_transaction+0xce6/0x1620 [btrfs] btrfs_uuid_scan_kthread+0x4ee/0x5b0 [btrfs] kthread+0x39d/0x750 ret_from_fork+0x30/0x70 ret_from_fork_asm+0x1a/0x30 -> #2 (&fs_info->dev_replace.rwsem){++++}-{4:4}: down_read+0x9b/0x470 btrfs_map_block+0x2ce/0x2ce0 [btrfs] btrfs_submit_chunk+0x2d4/0x16c0 [btrfs] btrfs_submit_bbio+0x16/0x30 [btrfs] btree_write_cache_pages+0xb5a/0xf90 [btrfs] do_writepages+0x17f/0x7b0 __writeback_single_inode+0x114/0xb00 writeback_sb_inodes+0x52b/0xe00 wb_writeback+0x1a7/0x800 wb_workfn+0x12a/0xbd0 process_one_work+0x85a/0x1460 worker_thread+0x5e2/0xfc0 kthread+0x39d/0x750 ret_from_fork+0x30/0x70 ret_from_fork_asm+0x1a/0x30 -> #1 (&fs_info->zoned_meta_io_lock){+.+.}-{4:4}: __mutex_lock+0x1aa/0x1360 btree_write_cache_pages+0x252/0xf90 [btrfs] do_writepages+0x17f/0x7b0 __writeback_single_inode+0x114/0xb00 writeback_sb_inodes+0x52b/0xe00 wb_writeback+0x1a7/0x800 wb_workfn+0x12a/0xbd0 process_one_work+0x85a/0x1460 worker_thread+0x5e2/0xfc0 kthread+0x39d/0x750 ret_from_fork+0x30/0x70 ret_from_fork_asm+0x1a/0x30 -> #0 ((work_completion)(&(&wb->dwork)->work)){+.+.}-{0:0}: __lock_acquire+0x2f52/0x5ea0 lock_acquire+0x1b1/0x540 __flush_work+0x3ac/0xb60 wb_shutdown+0x15b/0x1f0 bdi_unregister+0x172/0x5b0 del_gendisk+0x841/0xa20 sd_remove+0x85/0x130 device_release_driver_internal+0x368/0x520 bus_remove_device+0x1f1/0x3f0 device_del+0x3bd/0x9c0 __scsi_remove_device+0x272/0x340 scsi_forget_host+0xf7/0x170 scsi_remove_host+0xd2/0x2a0 sdebug_driver_remove+0x52/0x2f0 [scsi_debug] device_release_driver_internal+0x368/0x520 bus_remove_device+0x1f1/0x3f0 device_del+0x3bd/0x9c0 device_unregister+0x13/0xa0 sdebug_do_remove_host+0x1fb/0x290 [scsi_debug] scsi_debug_exit+0x17/0x70 [scsi_debug] __do_sys_delete_module.isra.0+0x321/0x520 do_syscall_64+0x93/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e other info that might help us debug this: Chain exists of: (work_completion)(&(&wb->dwork)->work) --> &fs_info->dev_replace.rwsem --> &q->q_usage_counter(queue)#16 Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&q->q_usage_counter(queue)#16); lock(&fs_info->dev_replace.rwsem); lock(&q->q_usage_counter(queue)#16); lock((work_completion)(&(&wb->dwork)->work)); *** DEADLOCK *** 5 locks held by 
modprobe/1110: #0: ffff88811f7bc108 (&dev->mutex){....}-{4:4}, at: device_release_driver_internal+0x8f/0x520 #1: ffff8881022ee0e0 (&shost->scan_mutex){+.+.}-{4:4}, at: scsi_remove_host+0x20/0x2a0 #2: ffff88811b4c4378 (&dev->mutex){....}-{4:4}, at: device_release_driver_internal+0x8f/0x520 #3: ffff8881205b6f20 (&q->q_usage_counter(queue)#16){++++}-{0:0}, at: sd_remove+0x85/0x130 #4: ffffffffa3284360 (rcu_read_lock){....}-{1:3}, at: __flush_work+0xda/0xb60 stack backtrace: CPU: 0 UID: 0 PID: 1110 Comm: modprobe Not tainted 6.14.0-rc1 #252 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 Call Trace: dump_stack_lvl+0x6a/0x90 print_circular_bug.cold+0x1e0/0x274 check_noncircular+0x306/0x3f0 ? __pfx_check_noncircular+0x10/0x10 ? mark_lock+0xf5/0x1650 ? __pfx_check_irq_usage+0x10/0x10 ? lockdep_lock+0xca/0x1c0 ? __pfx_lockdep_lock+0x10/0x10 __lock_acquire+0x2f52/0x5ea0 ? __pfx___lock_acquire+0x10/0x10 ? __pfx_mark_lock+0x10/0x10 lock_acquire+0x1b1/0x540 ? __flush_work+0x38f/0xb60 ? __pfx_lock_acquire+0x10/0x10 ? __pfx_lock_release+0x10/0x10 ? mark_held_locks+0x94/0xe0 ? __flush_work+0x38f/0xb60 __flush_work+0x3ac/0xb60 ? __flush_work+0x38f/0xb60 ? __pfx_mark_lock+0x10/0x10 ? __pfx___flush_work+0x10/0x10 ? __pfx_wq_barrier_func+0x10/0x10 ? __pfx___might_resched+0x10/0x10 ? mark_held_locks+0x94/0xe0 wb_shutdown+0x15b/0x1f0 bdi_unregister+0x172/0x5b0 ? __pfx_bdi_unregister+0x10/0x10 ? up_write+0x1ba/0x510 del_gendisk+0x841/0xa20 ? __pfx_del_gendisk+0x10/0x10 ? _raw_spin_unlock_irqrestore+0x35/0x60 ? __pm_runtime_resume+0x79/0x110 sd_remove+0x85/0x130 device_release_driver_internal+0x368/0x520 ? kobject_put+0x5d/0x4a0 bus_remove_device+0x1f1/0x3f0 device_del+0x3bd/0x9c0 ? __pfx_device_del+0x10/0x10 __scsi_remove_device+0x272/0x340 scsi_forget_host+0xf7/0x170 scsi_remove_host+0xd2/0x2a0 sdebug_driver_remove+0x52/0x2f0 [scsi_debug] ? kernfs_remove_by_name_ns+0xc0/0xf0 device_release_driver_internal+0x368/0x520 ? kobject_put+0x5d/0x4a0 bus_remove_device+0x1f1/0x3f0 device_del+0x3bd/0x9c0 ? __pfx_device_del+0x10/0x10 ? __pfx___mutex_unlock_slowpath+0x10/0x10 device_unregister+0x13/0xa0 sdebug_do_remove_host+0x1fb/0x290 [scsi_debug] scsi_debug_exit+0x17/0x70 [scsi_debug] __do_sys_delete_module.isra.0+0x321/0x520 ? __pfx___do_sys_delete_module.isra.0+0x10/0x10 ? __pfx_slab_free_after_rcu_debug+0x10/0x10 ? kasan_save_stack+0x2c/0x50 ? kasan_record_aux_stack+0xa3/0xb0 ? __call_rcu_common.constprop.0+0xc4/0xfb0 ? kmem_cache_free+0x3a0/0x590 ? __x64_sys_close+0x78/0xd0 do_syscall_64+0x93/0x180 ? lock_is_held_type+0xd5/0x130 ? __call_rcu_common.constprop.0+0x3c0/0xfb0 ? lockdep_hardirqs_on+0x78/0x100 ? __call_rcu_common.constprop.0+0x3c0/0xfb0 ? __pfx___call_rcu_common.constprop.0+0x10/0x10 ? kmem_cache_free+0x3a0/0x590 ? lockdep_hardirqs_on_prepare+0x16d/0x400 ? do_syscall_64+0x9f/0x180 ? lockdep_hardirqs_on+0x78/0x100 ? do_syscall_64+0x9f/0x180 ? __pfx___x64_sys_openat+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x16d/0x400 ? do_syscall_64+0x9f/0x180 ? lockdep_hardirqs_on+0x78/0x100 ? 
do_syscall_64+0x9f/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f436712b68b RSP: 002b:00007ffe9f1a8658 EFLAGS: 00000206 ORIG_RAX: 00000000000000b0 RAX: ffffffffffffffda RBX: 00005559b367fd80 RCX: 00007f436712b68b RDX: 0000000000000000 RSI: 0000000000000800 RDI: 00005559b367fde8 RBP: 00007ffe9f1a8680 R08: 1999999999999999 R09: 0000000000000000 R10: 00007f43671a5fe0 R11: 0000000000000206 R12: 0000000000000000 R13: 00007ffe9f1a86b0 R14: 0000000000000000 R15: 0000000000000000 Reported-by: Shin'ichiro Kawasaki CC: # 6.13+ Tested-by: Shin'ichiro Kawasaki Reviewed-by: Damien Le Moal Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba --- fs/btrfs/zoned.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 7c502192cd6b..4a3e02b49f29 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1277,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; -- cgit v1.2.3 From 87c259a7a359e73e6c52c68fcbec79988999b4e6 Mon Sep 17 00:00:00 2001 From: gaoxu Date: Thu, 17 Apr 2025 07:30:00 +0000 Subject: cgroup: Fix compilation issue due to cgroup_mutex not being exported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When adding folio_memcg function call in the zram module for Android16-6.12, the following error occurs during compilation: ERROR: modpost: "cgroup_mutex" [../soc-repo/zram.ko] undefined! This error is caused by the indirect call to lockdep_is_held(&cgroup_mutex) within folio_memcg. The export setting for cgroup_mutex is controlled by the CONFIG_PROVE_RCU macro. If CONFIG_LOCKDEP is enabled while CONFIG_PROVE_RCU is not, this compilation error will occur. To resolve this issue, add a parallel macro CONFIG_LOCKDEP control to ensure cgroup_mutex is properly exported when needed. 
Signed-off-by: gao xu Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 3caf2cd86e65..9c1bf7f7c812 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,7 +90,7 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif -- cgit v1.2.3 From 1bf67c8fdbda21fadd564a12dbe2b13c1ea5eda7 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Wed, 16 Apr 2025 21:17:51 +0000 Subject: cgroup/cpuset-v1: Add missing support for cpuset_v2_mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Android has mounted the v1 cpuset controller using filesystem type "cpuset" (not "cgroup") since 2015 [1], and depends on the resulting behavior where the controller name is not added as a prefix for cgroupfs files. [2] Later, a problem was discovered where cpu hotplug onlining did not affect the cpuset/cpus files, which Android carried an out-of-tree patch to address for a while. An attempt was made to upstream this patch, but the recommendation was to use the "cpuset_v2_mode" mount option instead. [3] An effort was made to do so, but this fails with "cgroup: Unknown parameter 'cpuset_v2_mode'" because commit e1cba4b85daa ("cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup") did not update the special cased cpuset_mount(), and only the cgroup (v1) filesystem type was updated. Add parameter parsing to the cpuset filesystem type so that cpuset_v2_mode works like the cgroup filesystem type: $ mkdir /dev/cpuset $ mount -t cpuset -ocpuset_v2_mode none /dev/cpuset $ mount|grep cpuset none on /dev/cpuset type cgroup (rw,relatime,cpuset,noprefix,cpuset_v2_mode,release_agent=/sbin/cpuset_release_agent) [1] https://cs.android.com/android/_/android/platform/system/core/+/b769c8d24fd7be96f8968aa4c80b669525b930d3 [2] https://cs.android.com/android/platform/superproject/main/+/main:system/core/libprocessgroup/setup/cgroup_map_write.cpp;drc=2dac5d89a0f024a2d0cc46a80ba4ee13472f1681;l=192 [3] https://lore.kernel.org/lkml/f795f8be-a184-408a-0b5a-553d26061385@redhat.com/T/ Fixes: e1cba4b85daa ("cgroup: Add mount flag to enable cpuset to use v2 behavior in v1 cgroup") Signed-off-by: T.J. 
Mercier Acked-by: Waiman Long Reviewed-by: Kamalesh Babulal Acked-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 9c1bf7f7c812..63e5b90da1f3 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2353,9 +2353,37 @@ static struct file_system_type cgroup2_fs_type = { }; #ifdef CONFIG_CPUSETS_V1 +enum cpuset_param { + Opt_cpuset_v2_mode, +}; + +static const struct fs_parameter_spec cpuset_fs_parameters[] = { + fsparam_flag ("cpuset_v2_mode", Opt_cpuset_v2_mode), + {} +}; + +static int cpuset_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct cgroup_fs_context *ctx = cgroup_fc2context(fc); + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, cpuset_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cpuset_v2_mode: + ctx->flags |= CGRP_ROOT_CPUSET_V2_MODE; + return 0; + } + return -EINVAL; +} + static const struct fs_context_operations cpuset_fs_context_ops = { .get_tree = cgroup1_get_tree, .free = cgroup_fs_context_free, + .parse_param = cpuset_parse_param, }; /* @@ -2392,6 +2420,7 @@ static int cpuset_init_fs_context(struct fs_context *fc) static struct file_system_type cpuset_fs_type = { .name = "cpuset", .init_fs_context = cpuset_init_fs_context, + .parameters = cpuset_fs_parameters, .fs_flags = FS_USERNS_MOUNT, }; #endif -- cgit v1.2.3 From 31d1139956112dd047a70b263f4d578921de779a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 10:40:17 -0400 Subject: ftrace: Initialize variables for ftrace_startup/shutdown_subops() The reworking to fix and simplify the ftrace_startup_subops() and the ftrace_shutdown_subops() made it possible for the filter_hash and notrace_hash variables to be used uninitialized in a way that the compiler did not catch it. Initialize both filter_hash and notrace_hash to the EMPTY_HASH as that is what they should be if they never are used. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417104017.3aea66c2@gandalf.local.home Reported-by: Venkat Rao Bagalkote Tested-by: Venkat Rao Bagalkote Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Closes: https://lore.kernel.org/all/1db64a42-626d-4b3a-be08-c65e47333ce2@linux.ibm.com/ Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a8a02868b435..43394445390c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3490,8 +3490,8 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** */ int ftrace_startup_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; struct ftrace_hash *save_filter_hash; struct ftrace_hash *save_notrace_hash; int ret; @@ -3625,8 +3625,8 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * */ int ftrace_shutdown_subops(struct ftrace_ops *ops, struct ftrace_ops *subops, int command) { - struct ftrace_hash *filter_hash; - struct ftrace_hash *notrace_hash; + struct ftrace_hash *filter_hash = EMPTY_HASH; + struct ftrace_hash *notrace_hash = EMPTY_HASH; int ret; if (unlikely(ftrace_disabled)) -- cgit v1.2.3 From 08275e59a75047ba8fc0b9853bfdfc88a124763d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 11:09:33 -0400 Subject: ftrace: Reinitialize hash to EMPTY_HASH after freeing There's several locations that free a ftrace hash pointer but may be referenced again. Reset them to EMPTY_HASH so that a u-a-f bug doesn't happen. Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417110933.20ab718b@gandalf.local.home Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 43394445390c..d0e4a902bb40 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -1297,6 +1297,8 @@ void ftrace_free_filter(struct ftrace_ops *ops) return; free_ftrace_hash(ops->func_hash->filter_hash); free_ftrace_hash(ops->func_hash->notrace_hash); + ops->func_hash->filter_hash = EMPTY_HASH; + ops->func_hash->notrace_hash = EMPTY_HASH; } EXPORT_SYMBOL_GPL(ftrace_free_filter); @@ -3443,6 +3445,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** size_bits); if (ret < 0) { free_ftrace_hash(*filter_hash); + *filter_hash = EMPTY_HASH; return ret; } } @@ -3472,6 +3475,7 @@ static int add_next_hash(struct ftrace_hash **filter_hash, struct ftrace_hash ** subops_hash->notrace_hash); if (ret < 0) { free_ftrace_hash(*notrace_hash); + *notrace_hash = EMPTY_HASH; return ret; } } -- cgit v1.2.3 From c45c585dde535e5ae2c363594bde3e05ce94a296 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 13:59:39 -0400 Subject: ftrace: Free ftrace hashes after they are replaced in the subops code The subops processing creates new hashes when adding and removing subops. There were some places that the old hashes that were replaced were not freed and this caused some memory leaks. 
Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Link: https://lore.kernel.org/20250417135939.245b128d@gandalf.local.home Fixes: 0ae6b8ce200d ("ftrace: Fix accounting of subop hashes") Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index d0e4a902bb40..41dcfcf8b40a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3609,6 +3609,9 @@ static int rebuild_hashes(struct ftrace_hash **filter_hash, struct ftrace_hash * } } + free_ftrace_hash(temp_hash.filter_hash); + free_ftrace_hash(temp_hash.notrace_hash); + temp_hash.filter_hash = *filter_hash; temp_hash.notrace_hash = *notrace_hash; } @@ -3703,8 +3706,11 @@ static int ftrace_hash_move_and_update_subops(struct ftrace_ops *subops, } ret = rebuild_hashes(&filter_hash, ¬race_hash, ops); - if (!ret) + if (!ret) { ret = ftrace_update_ops(ops, filter_hash, notrace_hash); + free_ftrace_hash(filter_hash); + free_ftrace_hash(notrace_hash); + } if (ret) { /* Put back the original hash */ -- cgit v1.2.3 From 92f1d3b40179b15630d72e2c6e4e25a899b67ba9 Mon Sep 17 00:00:00 2001 From: Menglong Dong Date: Sun, 13 Apr 2025 09:44:44 +0800 Subject: ftrace: fix incorrect hash size in register_ftrace_direct() The maximum of the ftrace hash bits is made fls(32) in register_ftrace_direct(), which seems illogical. So, we fix it by making the max hash bits FTRACE_HASH_MAX_BITS instead. Link: https://lore.kernel.org/20250413014444.36724-1-dongml2@chinatelecom.cn Fixes: d05cb470663a ("ftrace: Fix modification of direct_function hash while in use") Signed-off-by: Menglong Dong Acked-by: Masami Hiramatsu (Google) Signed-off-by: Steven Rostedt (Google) --- kernel/trace/ftrace.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 41dcfcf8b40a..61130bb34d6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -5964,9 +5964,10 @@ int register_ftrace_direct(struct ftrace_ops *ops, unsigned long addr) /* Make a copy hash to place the new and the old entries in */ size = hash->count + direct_functions->count; - if (size > 32) - size = 32; - new_hash = alloc_ftrace_hash(fls(size)); + size = fls(size); + if (size > FTRACE_HASH_MAX_BITS) + size = FTRACE_HASH_MAX_BITS; + new_hash = alloc_ftrace_hash(size); if (!new_hash) goto out_unlock; -- cgit v1.2.3 From 3b4e87e6a593d571183c414d81758624da01f2b9 Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Sun, 13 Apr 2025 00:10:43 +0200 Subject: ftrace: Fix type of ftrace_graph_ent_entry.depth ftrace_graph_ent.depth is int, but ftrace_graph_ent_entry.depth is unsigned long. This confuses trace-cmd on 64-bit big-endian systems and makes it print a huge amount of spaces. Fix this by using unsigned int, which has a matching size, instead. 
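To make the size mismatch concrete, here is a small userspace sketch (my own illustration, not kernel code; the byte values are invented). A 4-byte depth of 3, read back through a field that the event format declares as 8 bytes wide, comes out as 3 << 32 on a big-endian machine, and that huge value is what trace-cmd then turns into indentation:

  #include <stdio.h>
  #include <stdint.h>

  int main(void)
  {
      /* 32-bit depth = 3 as it sits in memory on a big-endian machine,
       * followed by whatever 4 bytes come next in the record (zero here). */
      unsigned char raw[8] = { 0, 0, 0, 3, 0, 0, 0, 0 };
      uint64_t v = 0;

      /* Parse it the way a format describing an 8-byte field would. */
      for (int i = 0; i < 8; i++)
          v = (v << 8) | raw[i];

      printf("depth parsed as an 8-byte field: %llu\n",
             (unsigned long long)v);    /* 12884901888 instead of 3 */
      return 0;
  }

With the field declared as a 4-byte unsigned int, the declared size matches the struct member again and the value round-trips as 3.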
Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Sven Schnelle Cc: Heiko Carstens Cc: Vasily Gorbik Cc: Alexander Gordeev Link: https://lore.kernel.org/20250412221847.17310-2-iii@linux.ibm.com Fixes: ff5c9c576e75 ("ftrace: Add support for function argument to graph tracer") Signed-off-by: Ilya Leoshkevich Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_entries.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index ee40d4e6ad1c..4ef4df6623a8 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -80,11 +80,11 @@ FTRACE_ENTRY(funcgraph_entry, ftrace_graph_ent_entry, F_STRUCT( __field_struct( struct ftrace_graph_ent, graph_ent ) __field_packed( unsigned long, graph_ent, func ) - __field_packed( unsigned long, graph_ent, depth ) + __field_packed( unsigned int, graph_ent, depth ) __dynamic_array(unsigned long, args ) ), - F_printk("--> %ps (%lu)", (void *)__entry->func, __entry->depth) + F_printk("--> %ps (%u)", (void *)__entry->func, __entry->depth) ); #ifdef CONFIG_FUNCTION_GRAPH_RETADDR -- cgit v1.2.3 From 6405f5b70b1c240ffddef01c7a140498f47d4fe7 Mon Sep 17 00:00:00 2001 From: Lucas De Marchi Date: Wed, 9 Apr 2025 21:59:34 -0700 Subject: drm/xe: Set LRC addresses before guc load The metadata saved in the ADS is read by GuC when it's initialized. Saving the addresses to the LRCs when they are populated is too late as GuC will keep using the old ones. This was causing GuC to use the RCS LRC for any engine class. It's not a big problem on a Linux-only scenario since the they are used by GuC only on media engines when the watchdog is triggered. However, in a virtualization scenario with Windows as the VF, it causes the wrong LRCs to be loaded as the watchdog is used for all engines. Fix it by letting guc_golden_lrc_init() initialize the metadata, like other *_init() functions, and later guc_golden_lrc_populate() to copy the LRCs to the right places. The former is called before the second GuC load, while the latter is called after LRCs have been recorded. Cc: Chee Yin Wong Cc: John Harrison Cc: Matt Roper Cc: Matthew Brost Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Cc: # v6.11+ Reviewed-by: Matthew Brost Tested-by: Chee Yin Wong Link: https://lore.kernel.org/r/20250409-fix-guc-ads-v1-1-494135f7a5d0@intel.com Signed-off-by: Lucas De Marchi (cherry picked from commit c31a0b6402d15b530514eee9925adfcb8cfbb1c9) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_guc_ads.c | 75 ++++++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 30 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_guc_ads.c b/drivers/gpu/drm/xe/xe_guc_ads.c index e7c9e095a19f..7031542a70ce 100644 --- a/drivers/gpu/drm/xe/xe_guc_ads.c +++ b/drivers/gpu/drm/xe/xe_guc_ads.c @@ -490,24 +490,52 @@ static void fill_engine_enable_masks(struct xe_gt *gt, engine_enable_mask(gt, XE_ENGINE_CLASS_OTHER)); } -static void guc_prep_golden_lrc_null(struct xe_guc_ads *ads) +/* + * Write the offsets corresponding to the golden LRCs. 
The actual data is + * populated later by guc_golden_lrc_populate() + */ +static void guc_golden_lrc_init(struct xe_guc_ads *ads) { struct xe_device *xe = ads_to_xe(ads); + struct xe_gt *gt = ads_to_gt(ads); struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), offsetof(struct __guc_ads_blob, system_info)); - u8 guc_class; + size_t alloc_size, real_size; + u32 addr_ggtt, offset; + int class; + + offset = guc_ads_golden_lrc_offset(ads); + addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset; + + for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { + u8 guc_class; + + guc_class = xe_engine_class_to_guc_class(class); - for (guc_class = 0; guc_class <= GUC_MAX_ENGINE_CLASSES; ++guc_class) { if (!info_map_read(xe, &info_map, engine_enabled_masks[guc_class])) continue; + real_size = xe_gt_lrc_size(gt, class); + alloc_size = PAGE_ALIGN(real_size); + + /* + * This interface is slightly confusing. We need to pass the + * base address of the full golden context and the size of just + * the engine state, which is the section of the context image + * that starts after the execlists LRC registers. This is + * required to allow the GuC to restore just the engine state + * when a watchdog reset occurs. + * We calculate the engine state size by removing the size of + * what comes before it in the context image (which is identical + * on all engines). + */ ads_blob_write(ads, ads.eng_state_size[guc_class], - guc_ads_golden_lrc_size(ads) - - xe_lrc_skip_size(xe)); + real_size - xe_lrc_skip_size(xe)); ads_blob_write(ads, ads.golden_context_lrca[guc_class], - xe_bo_ggtt_addr(ads->bo) + - guc_ads_golden_lrc_offset(ads)); + addr_ggtt); + + addr_ggtt += alloc_size; } } @@ -857,7 +885,7 @@ void xe_guc_ads_populate_minimal(struct xe_guc_ads *ads) xe_map_memset(ads_to_xe(ads), ads_to_map(ads), 0, 0, ads->bo->size); guc_policies_init(ads); - guc_prep_golden_lrc_null(ads); + guc_golden_lrc_init(ads); guc_mapping_table_init_invalid(gt, &info_map); guc_doorbell_init(ads); @@ -883,7 +911,7 @@ void xe_guc_ads_populate(struct xe_guc_ads *ads) guc_policies_init(ads); fill_engine_enable_masks(gt, &info_map); guc_mmio_reg_state_init(ads); - guc_prep_golden_lrc_null(ads); + guc_golden_lrc_init(ads); guc_mapping_table_init(gt, &info_map); guc_capture_prep_lists(ads); guc_doorbell_init(ads); @@ -903,18 +931,22 @@ void xe_guc_ads_populate(struct xe_guc_ads *ads) guc_ads_private_data_offset(ads)); } -static void guc_populate_golden_lrc(struct xe_guc_ads *ads) +/* + * After the golden LRC's are recorded for each engine class by the first + * submission, copy them to the ADS, as initialized earlier by + * guc_golden_lrc_init(). + */ +static void guc_golden_lrc_populate(struct xe_guc_ads *ads) { struct xe_device *xe = ads_to_xe(ads); struct xe_gt *gt = ads_to_gt(ads); struct iosys_map info_map = IOSYS_MAP_INIT_OFFSET(ads_to_map(ads), offsetof(struct __guc_ads_blob, system_info)); size_t total_size = 0, alloc_size, real_size; - u32 addr_ggtt, offset; + u32 offset; int class; offset = guc_ads_golden_lrc_offset(ads); - addr_ggtt = xe_bo_ggtt_addr(ads->bo) + offset; for (class = 0; class < XE_ENGINE_CLASS_MAX; ++class) { u8 guc_class; @@ -931,26 +963,9 @@ static void guc_populate_golden_lrc(struct xe_guc_ads *ads) alloc_size = PAGE_ALIGN(real_size); total_size += alloc_size; - /* - * This interface is slightly confusing. We need to pass the - * base address of the full golden context and the size of just - * the engine state, which is the section of the context image - * that starts after the execlists LRC registers. 
This is - * required to allow the GuC to restore just the engine state - * when a watchdog reset occurs. - * We calculate the engine state size by removing the size of - * what comes before it in the context image (which is identical - * on all engines). - */ - ads_blob_write(ads, ads.eng_state_size[guc_class], - real_size - xe_lrc_skip_size(xe)); - ads_blob_write(ads, ads.golden_context_lrca[guc_class], - addr_ggtt); - xe_map_memcpy_to(xe, ads_to_map(ads), offset, gt->default_lrc[class], real_size); - addr_ggtt += alloc_size; offset += alloc_size; } @@ -959,7 +974,7 @@ static void guc_populate_golden_lrc(struct xe_guc_ads *ads) void xe_guc_ads_populate_post_load(struct xe_guc_ads *ads) { - guc_populate_golden_lrc(ads); + guc_golden_lrc_populate(ads); } static int guc_ads_action_update_policies(struct xe_guc_ads *ads, u32 policy_offset) -- cgit v1.2.3 From 2577b202458cddff85cc154b1fe7f313e0d1f418 Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Mon, 14 Apr 2025 14:25:40 +0100 Subject: drm/xe/userptr: fix notifier vs folio deadlock MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User is reporting what smells like notifier vs folio deadlock, where migrate_pages_batch() on core kernel side is holding folio lock(s) and then interacting with the mappings of it, however those mappings are tied to some userptr, which means calling into the notifier callback and grabbing the notifier lock. With perfect timing it looks possible that the pages we pulled from the hmm fault can get sniped by migrate_pages_batch() at the same time that we are holding the notifier lock to mark the pages as accessed/dirty, but at this point we also want to grab the folio locks(s) to mark them as dirty, but if they are contended from notifier/migrate_pages_batch side then we deadlock since folio lock won't be dropped until we drop the notifier lock. Fortunately the mark_page_accessed/dirty is not really needed in the first place it seems and should have already been done by hmm fault, so just remove it. 
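Sketching the cycle described above (simplified from the prose; not taken from an actual lockdep report):

  core mm side (migrate_pages_batch):
      holds the folio lock(s)
      -> walks the folio's mappings -> mmu notifier callback
      -> wants the userptr notifier lock

  xe side (xe_hmm_userptr_populate_range):
      holds the notifier lock while marking pages accessed/dirty
      -> set_page_dirty_lock() -> wants the folio lock

Each side holds the lock the other one needs. Dropping the accessed/dirty marking removes the folio lock acquisition from under the notifier lock, which breaks the cycle, and the hmm fault is expected to have marked the pages already.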
Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4765 Fixes: 0a98219bcc96 ("drm/xe/hmm: Don't dereference struct page pointers without notifier lock") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.10+ Reviewed-by: Thomas Hellström Reviewed-by: Matthew Brost Link: https://lore.kernel.org/r/20250414132539.26654-2-matthew.auld@intel.com (cherry picked from commit bd7c0cb695e87c0e43247be8196b4919edbe0e85) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_hmm.c | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_hmm.c b/drivers/gpu/drm/xe/xe_hmm.c index c3cc0fa105e8..57b71956ddf4 100644 --- a/drivers/gpu/drm/xe/xe_hmm.c +++ b/drivers/gpu/drm/xe/xe_hmm.c @@ -19,29 +19,6 @@ static u64 xe_npages_in_range(unsigned long start, unsigned long end) return (end - start) >> PAGE_SHIFT; } -/** - * xe_mark_range_accessed() - mark a range is accessed, so core mm - * have such information for memory eviction or write back to - * hard disk - * @range: the range to mark - * @write: if write to this range, we mark pages in this range - * as dirty - */ -static void xe_mark_range_accessed(struct hmm_range *range, bool write) -{ - struct page *page; - u64 i, npages; - - npages = xe_npages_in_range(range->start, range->end); - for (i = 0; i < npages; i++) { - page = hmm_pfn_to_page(range->hmm_pfns[i]); - if (write) - set_page_dirty_lock(page); - - mark_page_accessed(page); - } -} - static int xe_alloc_sg(struct xe_device *xe, struct sg_table *st, struct hmm_range *range, struct rw_semaphore *notifier_sem) { @@ -331,7 +308,6 @@ int xe_hmm_userptr_populate_range(struct xe_userptr_vma *uvma, if (ret) goto out_unlock; - xe_mark_range_accessed(&hmm_range, write); userptr->sg = &userptr->sgt; xe_hmm_userptr_set_mapped(uvma); userptr->notifier_seq = hmm_range.notifier_seq; -- cgit v1.2.3 From 25583ad42d091819157832e894179200ba8b54ee Mon Sep 17 00:00:00 2001 From: Matthew Auld Date: Thu, 10 Apr 2025 17:27:17 +0100 Subject: drm/xe/dma_buf: stop relying on placement in unmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The is_vram() is checking the current placement, however if we consider exported VRAM with dynamic dma-buf, it looks possible for the xe driver to async evict the memory, notifying the importer, however importer does not have to call unmap_attachment() immediately, but rather just as "soon as possible", like when the dma-resv idles. Following from this we would then pipeline the move, attaching the fence to the manager, and then update the current placement. But when the unmap_attachment() runs at some later point we might see that is_vram() is now false, and take the complete wrong path when dma-unmapping the sg, leading to explosions. To fix this check if the sgl was mapping a struct page. v2: - The attachment can be mapped multiple times it seems, so we can't really rely on encoding something in the attachment->priv. Instead see if the page_link has an encoded struct page. For vram we expect this to be NULL. 
Link: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4563 Fixes: dd08ebf6c352 ("drm/xe: Introduce a new DRM driver for Intel GPUs") Signed-off-by: Matthew Auld Cc: Thomas Hellström Cc: Matthew Brost Cc: # v6.8+ Acked-by: Christian König Link: https://lore.kernel.org/r/20250410162716.159403-2-matthew.auld@intel.com (cherry picked from commit d755887f8e5a2a18e15e6632a5193e5feea18499) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_dma_buf.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_dma_buf.c b/drivers/gpu/drm/xe/xe_dma_buf.c index f67803e15a0e..f7a20264ea33 100644 --- a/drivers/gpu/drm/xe/xe_dma_buf.c +++ b/drivers/gpu/drm/xe/xe_dma_buf.c @@ -145,10 +145,7 @@ static void xe_dma_buf_unmap(struct dma_buf_attachment *attach, struct sg_table *sgt, enum dma_data_direction dir) { - struct dma_buf *dma_buf = attach->dmabuf; - struct xe_bo *bo = gem_to_xe_bo(dma_buf->priv); - - if (!xe_bo_is_vram(bo)) { + if (sg_page(sgt->sgl)) { dma_unmap_sgtable(attach->dev, sgt, dir, 0); sg_free_table(sgt); kfree(sgt); -- cgit v1.2.3 From 78600df8f593407a3df2d6c48c35d0ad203d7fb4 Mon Sep 17 00:00:00 2001 From: Daniele Ceraolo Spurio Date: Wed, 16 Apr 2025 13:16:22 -0700 Subject: drm/xe/pxp: do not queue unneeded terminations from debugfs The PXP terminate debugfs currently unconditionally simulates a termination, no matter what the HW status is. This is unneeded if PXP is not in use and can cause errors if the HW init hasn't completed yet. To solve these issues, we can simply limit the terminations to the cases where PXP is fully initialized and in use. v2: s/pxp_status/ready/ to avoid confusion with pxp->status (John) Fixes: 385a8015b214 ("drm/xe/pxp: Add PXP debugfs support") Closes: https://gitlab.freedesktop.org/drm/xe/kernel/-/issues/4749 Signed-off-by: Daniele Ceraolo Spurio Cc: John Harrison Reviewed-by: John Harrison Link: https://lore.kernel.org/r/20250416201622.1295369-1-daniele.ceraolospurio@intel.com (cherry picked from commit ba1f62a0cac84757ca35f4217e3cd3a2654233ae) Signed-off-by: Lucas De Marchi --- drivers/gpu/drm/xe/xe_pxp_debugfs.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/xe/xe_pxp_debugfs.c b/drivers/gpu/drm/xe/xe_pxp_debugfs.c index ccfbacf08efc..525a2f6bb076 100644 --- a/drivers/gpu/drm/xe/xe_pxp_debugfs.c +++ b/drivers/gpu/drm/xe/xe_pxp_debugfs.c @@ -66,9 +66,18 @@ static int pxp_terminate(struct seq_file *m, void *data) { struct xe_pxp *pxp = node_to_pxp(m->private); struct drm_printer p = drm_seq_file_printer(m); + int ready = xe_pxp_get_readiness_status(pxp); - if (!xe_pxp_is_enabled(pxp)) - return -ENODEV; + if (ready < 0) + return ready; /* disabled or error occurred */ + else if (!ready) + return -EBUSY; /* init still in progress */ + + /* no need for a termination if PXP is not active */ + if (pxp->status != XE_PXP_ACTIVE) { + drm_printf(&p, "PXP not active\n"); + return 0; + } /* simulate a termination interrupt */ spin_lock_irq(&pxp->xe->irq.lock); -- cgit v1.2.3 From 750d0ac001e85b754404178ee8ce01cbc76a03be Mon Sep 17 00:00:00 2001 From: Maxime Chevallier Date: Wed, 16 Apr 2025 14:54:48 +0200 Subject: MAINTAINERS: Add entry for Socfpga DWMAC ethernet glue driver Socfpga's DWMAC glue comes in a variety of flavours with multiple options when it comes to physical interfaces, making it not so easy to test. Having access to a Cyclone5 with RGMII as well as Lynx PCS variants, add myself as a maintainer to help with reviews and testing. 
Signed-off-by: Maxime Chevallier Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250416125453.306029-1-maxime.chevallier@bootlin.com Signed-off-by: Jakub Kicinski --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 8c7d796131a8..067f72e8af10 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3191,6 +3191,12 @@ M: Dinh Nguyen S: Maintained F: drivers/clk/socfpga/ +ARM/SOCFPGA DWMAC GLUE LAYER +M: Maxime Chevallier +S: Maintained +F: Documentation/devicetree/bindings/net/socfpga-dwmac.txt +F: drivers/net/ethernet/stmicro/stmmac/dwmac-socfpga.c + ARM/SOCFPGA EDAC BINDINGS M: Matthew Gerlach S: Maintained -- cgit v1.2.3 From a8c5b0ed89a3f2c81c6ae0b041394e6eea0e7024 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 17 Apr 2025 18:30:03 -0400 Subject: tracing: Fix filter string testing The filter string testing uses strncpy_from_kernel/user_nofault() to retrieve the string to test the filter against. The if() statement was incorrect as it considered 0 as a fault, when it is only negative that it faulted. Running the following commands: # cd /sys/kernel/tracing # echo "filename.ustring ~ \"/proc*\"" > events/syscalls/sys_enter_openat/filter # echo 1 > events/syscalls/sys_enter_openat/enable # ls /proc/$$/maps # cat trace Would produce nothing, but with the fix it will produce something like: ls-1192 [007] ..... 8169.828333: sys_openat(dfd: ffffffffffffff9c, filename: 7efc18359904, flags: 80000, mode: 0) Link: https://lore.kernel.org/all/CAEf4BzbVPQ=BjWztmEwBPRKHUwNfKBkS3kce-Rzka6zvbQeVpg@mail.gmail.com/ Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andrew Morton Link: https://lore.kernel.org/20250417183003.505835fb@gandalf.local.home Fixes: 77360f9bbc7e5 ("tracing: Add test for user space strings when filtering on string pointers") Reported-by: Andrii Nakryiko Reported-by: Mykyta Yatsenko Signed-off-by: Steven Rostedt (Google) --- kernel/trace/trace_events_filter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 0993dfc1c5c1..2048560264bb 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -808,7 +808,7 @@ static __always_inline char *test_string(char *str) kstr = ubuf->buffer; /* For safety, do not trust the string pointer */ - if (!strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE)) + if (strncpy_from_kernel_nofault(kstr, str, USTRING_BUF_SIZE) < 0) return NULL; return kstr; } @@ -827,7 +827,7 @@ static __always_inline char *test_ustring(char *str) /* user space address? */ ustr = (char __user *)str; - if (!strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE)) + if (strncpy_from_user_nofault(kstr, ustr, USTRING_BUF_SIZE) < 0) return NULL; return kstr; -- cgit v1.2.3 From 4067196a52278156d18d8d6fa7f43970611b1b49 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Sat, 29 Mar 2025 19:10:29 +0200 Subject: mm/page_alloc: fix deadlock on cpu_hotplug_lock in __accept_page() When the last page in the zone is accepted, __accept_page() calls static_branch_dec(). This function takes cpu_hotplug_lock, which can lead to a deadlock if the allocation occurs during CPU bringup path as _cpu_up() also takes the lock. To prevent this deadlock, defer static_branch_dec() to a workqueue. Call static_branch_dec() only when the workqueue is not yet initialized. Workqueues are initialized before CPU bring up, so this will not conflict with the first scenario. 
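Spelled out as a call chain, the cycle this avoids is roughly (my paraphrase of the description above):

  _cpu_up()                                  takes cpu_hotplug_lock
    -> memory allocation on the bringup path
      -> __accept_page() accepts the last page in the zone
        -> static_branch_dec()               needs cpu_hotplug_lock again -> deadlock

With the fix, the static_branch_dec() runs from the zone's unaccepted_cleanup work item instead, outside the bringup path; the direct call is kept only for the window before workqueues are initialized, which by construction is before any CPU bringup.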
Link: https://lkml.kernel.org/r/20250329171030.3942298-1-kirill.shutemov@linux.intel.com Fixes: 55ad43e8ba0f ("mm: add a helper to accept page") Signed-off-by: Kirill A. Shutemov Reported-by: Srikanth Aithal Tested-by: Srikanth Aithal Cc: Dave Hansen Cc: Ashish Kalra Cc: David Hildenbrand Cc: "Edgecombe, Rick P" Cc: Mel Gorman Cc: "Mike Rapoport (IBM)" Cc: Thomas Lendacky Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 3 +++ mm/internal.h | 1 + mm/mm_init.c | 1 + mm/page_alloc.c | 28 ++++++++++++++++++++++++++-- 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 25e80b2ca7f4..4c95fcc9e9df 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -967,6 +967,9 @@ struct zone { #ifdef CONFIG_UNACCEPTED_MEMORY /* Pages to be accepted. All pages on the list are MAX_PAGE_ORDER */ struct list_head unaccepted_pages; + + /* To be called once the last page in the zone is accepted */ + struct work_struct unaccepted_cleanup; #endif /* zone flags, see below */ diff --git a/mm/internal.h b/mm/internal.h index 50c2f590b2d0..e9695baa5922 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1595,6 +1595,7 @@ unsigned long move_page_tables(struct pagetable_move_control *pmc); #ifdef CONFIG_UNACCEPTED_MEMORY void accept_page(struct page *page); +void unaccepted_cleanup_work(struct work_struct *work); #else /* CONFIG_UNACCEPTED_MEMORY */ static inline void accept_page(struct page *page) { diff --git a/mm/mm_init.c b/mm/mm_init.c index 84f14fa12d0d..9659689b8ace 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -1441,6 +1441,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) #ifdef CONFIG_UNACCEPTED_MEMORY INIT_LIST_HEAD(&zone->unaccepted_pages); + INIT_WORK(&zone->unaccepted_cleanup, unaccepted_cleanup_work); #endif } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1715e34b91af..e506e365d6f1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7191,6 +7191,11 @@ static DEFINE_STATIC_KEY_FALSE(zones_with_unaccepted_pages); static bool lazy_accept = true; +void unaccepted_cleanup_work(struct work_struct *work) +{ + static_branch_dec(&zones_with_unaccepted_pages); +} + static int __init accept_memory_parse(char *p) { if (!strcmp(p, "lazy")) { @@ -7229,8 +7234,27 @@ static void __accept_page(struct zone *zone, unsigned long *flags, __free_pages_ok(page, MAX_PAGE_ORDER, FPI_TO_TAIL); - if (last) - static_branch_dec(&zones_with_unaccepted_pages); + if (last) { + /* + * There are two corner cases: + * + * - If allocation occurs during the CPU bring up, + * static_branch_dec() cannot be used directly as + * it causes a deadlock on cpu_hotplug_lock. + * + * Instead, use schedule_work() to prevent deadlock. + * + * - If allocation occurs before workqueues are initialized, + * static_branch_dec() should be called directly. + * + * Workqueues are initialized before CPU bring up, so this + * will not conflict with the first scenario. 
+ */ + if (system_wq) + schedule_work(&zone->unaccepted_cleanup); + else + unaccepted_cleanup_work(&zone->unaccepted_cleanup); + } } void accept_page(struct page *page) -- cgit v1.2.3 From 98b1917cdef92c29fc9a14060d5606c619050c2c Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 10 Apr 2025 11:10:20 +0200 Subject: fs/dax: fix folio splitting issue by resetting old folio order + _nr_pages Alison reports an issue with fsdax when large extends end up using large ZONE_DEVICE folios: [ 417.796271] BUG: kernel NULL pointer dereference, address: 0000000000000b00 [ 417.796982] #PF: supervisor read access in kernel mode [ 417.797540] #PF: error_code(0x0000) - not-present page [ 417.798123] PGD 2a5c5067 P4D 2a5c5067 PUD 2a5c6067 PMD 0 [ 417.798690] Oops: Oops: 0000 [#1] SMP NOPTI [ 417.799178] CPU: 5 UID: 0 PID: 1515 Comm: mmap Tainted: ... [ 417.800150] Tainted: [O]=OOT_MODULE [ 417.800583] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 0.0.0 02/06/2015 [ 417.801358] RIP: 0010:__lruvec_stat_mod_folio+0x7e/0x250 [ 417.801948] Code: ... [ 417.803662] RSP: 0000:ffffc90002be3a08 EFLAGS: 00010206 [ 417.804234] RAX: 0000000000000000 RBX: 0000000000000200 RCX: 0000000000000002 [ 417.804984] RDX: ffffffff815652d7 RSI: 0000000000000000 RDI: ffffffff82a2beae [ 417.805689] RBP: ffffc90002be3a28 R08: 0000000000000000 R09: 0000000000000000 [ 417.806384] R10: ffffea0007000040 R11: ffff888376ffe000 R12: 0000000000000001 [ 417.807099] R13: 0000000000000012 R14: ffff88807fe4ab40 R15: ffff888029210580 [ 417.807801] FS: 00007f339fa7a740(0000) GS:ffff8881fa9b9000(0000) knlGS:0000000000000000 [ 417.808570] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 417.809193] CR2: 0000000000000b00 CR3: 000000002a4f0004 CR4: 0000000000370ef0 [ 417.809925] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 417.810622] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 417.811353] Call Trace: [ 417.811709] [ 417.812038] folio_add_file_rmap_ptes+0x143/0x230 [ 417.812566] insert_page_into_pte_locked+0x1ee/0x3c0 [ 417.813132] insert_page+0x78/0xf0 [ 417.813558] vmf_insert_page_mkwrite+0x55/0xa0 [ 417.814088] dax_fault_iter+0x484/0x7b0 [ 417.814542] dax_iomap_pte_fault+0x1ca/0x620 [ 417.815055] dax_iomap_fault+0x39/0x40 [ 417.815499] __xfs_write_fault+0x139/0x380 [ 417.815995] ? __handle_mm_fault+0x5e5/0x1a60 [ 417.816483] xfs_write_fault+0x41/0x50 [ 417.816966] xfs_filemap_fault+0x3b/0xe0 [ 417.817424] __do_fault+0x31/0x180 [ 417.817859] __handle_mm_fault+0xee1/0x1a60 [ 417.818325] ? debug_smp_processor_id+0x17/0x20 [ 417.818844] handle_mm_fault+0xe1/0x2b0 [...] The issue is that when we split a large ZONE_DEVICE folio to order-0 ones, we don't reset the order/_nr_pages. As folio->_nr_pages overlays page[1]->memcg_data, once page[1] is a folio, it suddenly looks like it has folio->memcg_data set. And we never manually initialize folio->memcg_data in fsdax code, because we never expect it to be set at all. When __lruvec_stat_mod_folio() then stumbles over such a folio, it tries to use folio->memcg_data (because it's non-NULL) but it does not actually point at a memcg, resulting in the problem. Alison also observed that these folios sometimes have "locked" set, which is rather concerning (folios locked from the beginning ...). The reason is that the order for large folios is stored in page[1]->flags, which become the folio->flags of a new small folio. Let's fix it by adding a folio helper to clear order/_nr_pages for splitting purposes. 
Maybe we should reinitialize other large folio flags / folio members as well when splitting, because they might similarly cause harm once page[1] becomes a folio? At least other flags in PAGE_FLAGS_SECOND should not be set for fsdax, so at least page[1]->flags might be as expected with this fix. From a quick glimpse, initializing ->mapping, ->pgmap and ->share should re-initialize most things from a previous page[1] used by large folios that fsdax cares about. For example folio->private might not get reinitialized, but maybe that's not relevant -- no traces of it's use in fsdax code. Needs a closer look. Another thing that should be considered in the future is performing similar checks as we perform in free_tail_page_prepare() -- checking pincount etc. -- when freeing a large fsdax folio. Link: https://lkml.kernel.org/r/20250410091020.119116-1-david@redhat.com Fixes: 4996fc547f5b ("mm: let _folio_nr_pages overlay memcg_data in first tail page") Fixes: 38607c62b34b ("fs/dax: properly refcount fs dax pages") Signed-off-by: David Hildenbrand Reported-by: Alison Schofield Closes: https://lkml.kernel.org/r/Z_W9Oeg-D9FhImf3@aschofie-mobl2.lan Tested-by: Alison Schofield Reviewed-by: Dan Williams Tested-by: "Darrick J. Wong" Cc: Alexander Viro Cc: Christian Brauner Cc: Jan Kara Cc: Matthew Wilcox Cc: Alistair Popple Cc: Christoph Hellwig Signed-off-by: Andrew Morton --- fs/dax.c | 1 + include/linux/mm.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index af5045b0f476..676303419e9e 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -396,6 +396,7 @@ static inline unsigned long dax_folio_put(struct folio *folio) order = folio_order(folio); if (!order) return 0; + folio_reset_order(folio); for (i = 0; i < (1UL << order); i++) { struct dev_pagemap *pgmap = page_pgmap(&folio->page); diff --git a/include/linux/mm.h b/include/linux/mm.h index b7f13f087954..bf55206935c4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1218,6 +1218,23 @@ static inline unsigned int folio_order(const struct folio *folio) return folio_large_order(folio); } +/** + * folio_reset_order - Reset the folio order and derived _nr_pages + * @folio: The folio. + * + * Reset the order and derived _nr_pages to 0. Must only be used in the + * process of splitting large folios. + */ +static inline void folio_reset_order(struct folio *folio) +{ + if (WARN_ON_ONCE(!folio_test_large(folio))) + return; + folio->_flags_1 &= ~0xffUL; +#ifdef NR_PAGES_IN_LARGE_FOLIO + folio->_nr_pages = 0; +#endif +} + #include /* -- cgit v1.2.3 From 8ad5ac8f4fc4848d17db809038773ee0bee76b0b Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 10 Apr 2025 11:00:22 +0200 Subject: MAINTAINERS: update SLAB ALLOCATOR maintainers With permission, reduce the number of maintainers. Create a CREDITS entry for Joonsoo (Pekka already has one). Thanks for all the work! Link: https://lkml.kernel.org/r/20250410090021.72296-3-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Harry Yoo Acked-by: Christoph Lameter (Ampere) Acked-by: David Rientjes Cc: Pekka Enberg Cc: Joonsoo Kim Cc: Roman Gushchin Cc: Brendan Jackman Cc: Johannes Weiner Cc: Michal Hocko Cc: Suren Baghdasaryan Cc: Zi Yan Signed-off-by: Andrew Morton --- CREDITS | 4 ++++ MAINTAINERS | 2 -- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CREDITS b/CREDITS index 1b77fba6c27e..f74d230992d6 100644 --- a/CREDITS +++ b/CREDITS @@ -2071,6 +2071,10 @@ S: 660 Harvard Ave. 
#7 S: Santa Clara, CA 95051 S: USA +N: Joonsoo Kim +E: iamjoonsoo.kim@lge.com +D: Slab allocators + N: Kukjin Kim E: kgene@kernel.org D: Samsung S3C, S5P and Exynos ARM architectures diff --git a/MAINTAINERS b/MAINTAINERS index 8c7d796131a8..16c9e10622df 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -22249,9 +22249,7 @@ F: drivers/nvmem/layouts/sl28vpd.c SLAB ALLOCATOR M: Christoph Lameter -M: Pekka Enberg M: David Rientjes -M: Joonsoo Kim M: Andrew Morton M: Vlastimil Babka R: Roman Gushchin -- cgit v1.2.3 From 5e610c8c09990dc4bfdfdc56138838a41a718967 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 10 Apr 2025 11:00:23 +0200 Subject: MAINTAINERS: add MM subsection for the page allocator Add a subsection for the page allocator, including compaction as it's crucial for high-order allocations and works together with the anti-fragmentation features. Add reviewers (including myself) who voluteered. Link: https://lkml.kernel.org/r/20250410090021.72296-4-vbabka@suse.cz Signed-off-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Zi Yan Acked-by: Brendan Jackman Acked-by: Johannes Weiner Cc: Suren Baghdasaryan Cc: Christoph Lameter (Ampere) Cc: David Rientjes Cc: Harry Yoo Cc: Joonsoo Kim Cc: Pekka Enberg Cc: Roman Gushchin Signed-off-by: Andrew Morton --- MAINTAINERS | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 16c9e10622df..2f7da8cc290a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15516,6 +15516,21 @@ F: mm/numa.c F: mm/numa_emulation.c F: mm/numa_memblks.c +MEMORY MANAGEMENT - PAGE ALLOCATOR +M: Andrew Morton +R: Vlastimil Babka +R: Suren Baghdasaryan +R: Michal Hocko +R: Brendan Jackman +R: Johannes Weiner +R: Zi Yan +L: linux-mm@kvack.org +S: Maintained +F: mm/compaction.c +F: mm/page_alloc.c +F: include/linux/gfp.h +F: include/linux/compaction.h + MEMORY MANAGEMENT - SECRETMEM M: Andrew Morton M: Mike Rapoport -- cgit v1.2.3 From 6b956934ad6d9f76c66c8fab570b07536c6ca472 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Thu, 10 Apr 2025 16:18:12 +0800 Subject: mm: memcontrol: fix swap counter leak from offline cgroup commit 73f839b6d2ed addressed an issue regarding the swap counter leak that occurred from an offline cgroup. However, commit 89ce924f0bd4 modified the parameter from @swap_memcg to @memcg (presumably this alteration was introduced while resolving conflicts). Fix this problem by reverting this minor change. Link: https://lkml.kernel.org/r/20250410081812.10073-1-songmuchun@bytedance.com Fixes: 89ce924f0bd4 ("mm: memcontrol: move memsw charge callbacks to v1") Signed-off-by: Muchun Song Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Michal Hocko Signed-off-by: Andrew Morton --- mm/memcontrol-v1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 8660908850dc..4a9cf27a70af 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -620,7 +620,7 @@ void memcg1_swapout(struct folio *folio, swp_entry_t entry) mem_cgroup_id_get_many(swap_memcg, nr_entries - 1); mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); - swap_cgroup_record(folio, mem_cgroup_id(memcg), entry); + swap_cgroup_record(folio, mem_cgroup_id(swap_memcg), entry); folio_unqueue_deferred_split(folio); folio->memcg_data = 0; -- cgit v1.2.3 From 1413efdb254f41c05ae5c13aa975ecd9733f37a7 Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 11 Apr 2025 13:33:28 -0400 Subject: MAINTAINERS: add mmap trace events to MEMORY MAPPING MEMORY MAPPING does not list the mmap.h trace point file, but does list the mmap.c file. Couple the trace points with the users and authors of the trace points for notifications of updates. Link: https://lkml.kernel.org/r/20250411173328.8172-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: SeongJae Park Acked-by: Steven Rostedt (Google) Acked-by: Vlastimil Babka Cc: Lorenzo Stoakes Cc: Jann Horn Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2f7da8cc290a..78006958e7c7 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15562,6 +15562,7 @@ L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/trace/events/mmap.h F: mm/mlock.c F: mm/mmap.c F: mm/mprotect.c -- cgit v1.2.3 From 86fba6127e197c7d646e8ee771df6026e14211dc Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 11 Apr 2025 08:27:24 +0100 Subject: MAINTAINERS: add memory advice section The madvise code straddles both VMA and page table manipulation. As a result, separate it out into its own section and add maintainers/reviewers as appropriate. We additionally include the mman-common.h file as this contains the shared madvise flags and it is important we maintain this alongside madvise.c. Link: https://lkml.kernel.org/r/20250411072724.10841-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Liam R. Howlett Acked-by: Vlastimil Babka Acked-by: Jann Horn Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- MAINTAINERS | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 78006958e7c7..aaeb4d7113c3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15573,6 +15573,20 @@ F: mm/vma.h F: mm/vma_internal.h F: tools/testing/vma/ +MEMORY MAPPING - MADVISE (MEMORY ADVICE) +M: Andrew Morton +M: Liam R. Howlett +M: Lorenzo Stoakes +M: David Hildenbrand +R: Vlastimil Babka +R: Jann Horn +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: include/uapi/asm-generic/mman-common.h +F: mm/madvise.c + MEMORY TECHNOLOGY DEVICES (MTD) M: Miquel Raynal M: Richard Weinberger -- cgit v1.2.3 From 8c03ebd7cdc06bd0d2fecb4d1a609ef1dbb7d0aa Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 10 Apr 2025 11:57:14 +0800 Subject: mm/gup: fix wrongly calculated returned value in fault_in_safe_writeable() Not like fault_in_readable() or fault_in_writeable(), in fault_in_safe_writeable() local variable 'start' is increased page by page to loop till the whole address range is handled. However, it mistakenly calculates the size of the handled range with 'uaddr - start'. Fix it here. Andreas said: : In gfs2, fault_in_iov_iter_writeable() is used in : gfs2_file_direct_read() and gfs2_file_read_iter(), so this potentially : affects buffered as well as direct reads. This bug could cause those : gfs2 functions to spin in a loop. 
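A worked example with made-up numbers (the function returns the number of bytes not faulted in):

  #include <stdio.h>

  int main(void)
  {
      /* Hypothetical values: user buffer at 0x1010, 0x3000 bytes requested,
       * and faulting stopped once 'start' had advanced to 0x3000. */
      unsigned long uaddr = 0x1010, size = 0x3000, start = 0x3000;

      unsigned long buggy = uaddr - start;   /* wraps to a huge value */
      unsigned long fixed = start - uaddr;   /* 0x1ff0 bytes were handled */

      /* Old code: "size > huge" is false, so it reports 0 bytes left over,
       * i.e. complete success, although 0x1010 bytes were never faulted in. */
      printf("old:   %#lx\n", size > buggy ? size - buggy : 0UL);

      /* Fixed code: reports the 0x1010 bytes that still need faulting. */
      printf("fixed: %#lx\n", size > fixed ? size - fixed : 0UL);
      return 0;
  }

That spurious report of full success is what can send the gfs2 callers back into the same partial read over and over.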
Link: https://lkml.kernel.org/r/20250410035717.473207-1-bhe@redhat.com Link: https://lkml.kernel.org/r/20250410035717.473207-2-bhe@redhat.com Signed-off-by: Baoquan He Fixes: fe673d3f5bf1 ("mm: gup: make fault_in_safe_writeable() use fixup_user_fault()") Reviewed-by: Oscar Salvador Acked-by: David Hildenbrand Cc: Andreas Gruenbacher Cc: Yanjun.Zhu Cc: Signed-off-by: Andrew Morton --- mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 92351e2fa876..84461d384ae2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2207,8 +2207,8 @@ size_t fault_in_safe_writeable(const char __user *uaddr, size_t size) } while (start != end); mmap_read_unlock(mm); - if (size > (unsigned long)uaddr - start) - return size - ((unsigned long)uaddr - start); + if (size > start - (unsigned long)uaddr) + return size - (start - (unsigned long)uaddr); return 0; } EXPORT_SYMBOL(fault_in_safe_writeable); -- cgit v1.2.3 From fd0ad5e9d158436b4a9b34c60582488585e1d90d Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Mon, 14 Apr 2025 09:35:31 +0200 Subject: docs: ABI: replace mcroce@microsoft.com with new Meta address The Microsoft email address is bouncing: 550 5.4.1 Recipient address rejected: Access denied. So let's replace it with Matteo's current mail address. Link: https://lkml.kernel.org/r/20250414-fix-mcroce-mail-bounce-v3-1-0aed2d71f3d7@pengutronix.de Signed-off-by: Ahmad Fatoum Acked-by: Matteo Croce Link: https://lore.kernel.org/all/BYAPR15MB2504E4B02DFFB1E55871955DA1062@BYAPR15MB2504.namprd15.prod.outlook.com/ Cc: Daniel Lezcano Cc: Jens Axboe Cc: Matteo Croce Cc: Sascha Hauer Signed-off-by: Andrew Morton --- Documentation/ABI/stable/sysfs-block | 2 +- Documentation/ABI/testing/sysfs-kernel-reboot | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/ABI/stable/sysfs-block b/Documentation/ABI/stable/sysfs-block index 3879963f0f01..11545c9e2e93 100644 --- a/Documentation/ABI/stable/sysfs-block +++ b/Documentation/ABI/stable/sysfs-block @@ -77,7 +77,7 @@ Description: What: /sys/block//diskseq Date: February 2021 -Contact: Matteo Croce +Contact: Matteo Croce Description: The /sys/block//diskseq files reports the disk sequence number, which is a monotonically increasing diff --git a/Documentation/ABI/testing/sysfs-kernel-reboot b/Documentation/ABI/testing/sysfs-kernel-reboot index e117aba46be0..52571fd5ddba 100644 --- a/Documentation/ABI/testing/sysfs-kernel-reboot +++ b/Documentation/ABI/testing/sysfs-kernel-reboot @@ -1,7 +1,7 @@ What: /sys/kernel/reboot Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Interface to set the kernel reboot behavior, similarly to what can be done via the reboot= cmdline option. (see Documentation/admin-guide/kernel-parameters.txt) @@ -9,25 +9,25 @@ Description: Interface to set the kernel reboot behavior, similarly to What: /sys/kernel/reboot/mode Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Reboot mode. Valid values are: cold warm hard soft gpio What: /sys/kernel/reboot/type Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Reboot type. Valid values are: bios acpi kbd triple efi pci What: /sys/kernel/reboot/cpu Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: CPU number to use to reboot. 
What: /sys/kernel/reboot/force Date: November 2020 KernelVersion: 5.11 -Contact: Matteo Croce +Contact: Matteo Croce Description: Don't wait for any other CPUs on reboot and avoid anything that could hang. -- cgit v1.2.3 From 9e888998ea4d22257b07ce911576509486fa0667 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Sat, 12 Apr 2025 18:39:12 +0200 Subject: writeback: fix false warning in inode_to_wb() inode_to_wb() is used also for filesystems that don't support cgroup writeback. For these filesystems inode->i_wb is stable during the lifetime of the inode (it points to bdi->wb) and there's no need to hold locks protecting the inode->i_wb dereference. Improve the warning in inode_to_wb() to not trigger for these filesystems. Link: https://lkml.kernel.org/r/20250412163914.3773459-3-agruenba@redhat.com Fixes: aaa2cacf8184 ("writeback: add lockdep annotation to inode_to_wb()") Signed-off-by: Jan Kara Signed-off-by: Andreas Gruenbacher Reviewed-by: Andreas Gruenbacher Cc: Signed-off-by: Andrew Morton --- include/linux/backing-dev.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 8e7af9a03b41..e721148c95d0 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -249,6 +249,7 @@ static inline struct bdi_writeback *inode_to_wb(const struct inode *inode) { #ifdef CONFIG_LOCKDEP WARN_ON_ONCE(debug_locks && + (inode->i_sb->s_iflags & SB_I_CGROUPWB) && (!lockdep_is_held(&inode->i_lock) && !lockdep_is_held(&inode->i_mapping->i_pages.xa_lock) && !lockdep_is_held(&inode->i_wb->list_lock))); -- cgit v1.2.3 From 274fe92de2c4e50dbfd1b30070b4f6d8a27b388a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 15 Apr 2025 13:18:59 +0200 Subject: mm, hugetlb: increment the number of pages to be reset on HVO commit 4eeec8c89a0c ("mm: move hugetlb specific things in folio to page[3]") shifted hugetlb specific stuff, and now mapping overlaps _hugetlb_cgroup field. Upon restoring the vmemmap for HVO, only the first two tail pages are reset, and this causes the check in free_tail_page_prepare() to fail as it finds an unexpected mapping value in some tails. Increment the number of pages to be reset to 4 (head + 3 tail pages) Link: https://lkml.kernel.org/r/20250415111859.376302-1-osalvador@suse.de Fixes: 4eeec8c89a0c ("mm: move hugetlb specific things in folio to page[3]") Signed-off-by: Oscar Salvador Suggested-by: David Hildenbrand Reviewed-by: David Hildenbrand Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 9a99dfa3c495..27245e86df25 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -238,11 +238,11 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, * struct page, the special metadata (e.g. page->flags or page->mapping) * cannot copy to the tail struct page structs. The invalid value will be * checked in the free_tail_page_prepare(). In order to avoid the message - * of "corrupted mapping in tail page". We need to reset at least 3 (one - * head struct page struct and two tail struct page structs) struct page + * of "corrupted mapping in tail page". We need to reset at least 4 (one + * head struct page struct and three tail struct page structs) struct page * structs. 
*/ -#define NR_RESET_STRUCT_PAGE 3 +#define NR_RESET_STRUCT_PAGE 4 static inline void reset_struct_pages(struct page *start) { -- cgit v1.2.3 From 8bdea2fce98033d392db16da246843cadeddef39 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 15 Apr 2025 11:50:07 +0200 Subject: mm/memory: move sanity checks in do_wp_page() after mapcount vs. refcount stabilization In __folio_remove_rmap() for RMAP_LEVEL_PMD/RMAP_LEVEL_PUD and with CONFIG_PAGE_MAPCOUNT we first decrement the folio mapcount (and recompute mapped shared vs. mapped exclusively) to then adjust the entire mapcount. This means that another process might stumble in do_wp_page() over a PTE-mapped PMD folio that is indicated as "exclusively mapped", but still has an entire mapcount (PMD mapping), because it is racing with the process that is unmapping the folio (PMD mapping). Note that do_wp_page() will back off once it detects the remaining folio reference from the process that is in the process of unmapping the folio. This will trigger the early VM_WARN_ON_ONCE(folio_entire_mapcount(folio)) check in do_wp_page(), that can easily be reproduced by looping a couple of times over allocating a PMD THP, forking a child where we immediately unmap it again, and writing in the parent concurrently to the THP. [ 252.738129][T16470] ------------[ cut here ]------------ [ 252.739267][T16470] WARNING: CPU: 3 PID: 16470 at mm/memory.c:3738 do_wp_page+0x2a75/0x2c00 [ 252.740968][T16470] Modules linked in: [ 252.741958][T16470] CPU: 3 UID: 0 PID: 16470 Comm: ... ... [ 252.765841][T16470] [ 252.766419][T16470] ? srso_alias_return_thunk+0x5/0xfbef5 [ 252.767558][T16470] ? rcu_is_watching+0x12/0x60 [ 252.768525][T16470] ? srso_alias_return_thunk+0x5/0xfbef5 [ 252.769645][T16470] ? srso_alias_return_thunk+0x5/0xfbef5 [ 252.770778][T16470] ? lock_acquire+0x33/0x80 [ 252.771697][T16470] ? __handle_mm_fault+0x5e8/0x3e40 [ 252.772735][T16470] ? __handle_mm_fault+0x5e8/0x3e40 [ 252.773781][T16470] __handle_mm_fault+0x1869/0x3e40 [ 252.774839][T16470] handle_mm_fault+0x22a/0x640 [ 252.775808][T16470] do_user_addr_fault+0x618/0x1000 [ 252.776847][T16470] exc_page_fault+0x68/0xd0 [ 252.777775][T16470] asm_exc_page_fault+0x26/0x30 While we could adjust the sequence in __folio_remove_rmap(), let's rater move the mapcount sanity checks after the mapcount vs. refcount stabilization phase. With this fix, a simple reproducer is happy. While at it, convert the two VM_WARN_ON_ONCE() we are moving to VM_WARN_ON_ONCE_FOLIO(). 
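For reference, a rough userspace sketch of that reproducer, reconstructed from the description above (untested, error handling omitted; assumes 2 MiB THPs enabled at least in madvise mode and a CONFIG_DEBUG_VM kernel without this fix, and whether the warning fires depends on timing — the over-map-and-align trick is only my way of getting a PMD-aligned buffer):

  #include <stdint.h>
  #include <string.h>
  #include <sys/mman.h>
  #include <sys/types.h>
  #include <sys/wait.h>
  #include <unistd.h>

  #define THP_SIZE (2UL << 20)

  int main(void)
  {
      for (int i = 0; i < 64; i++) {
          /* Over-map and align so the buffer can be backed by a PMD THP. */
          size_t len = 2 * THP_SIZE;
          char *raw = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          char *p = (char *)(((uintptr_t)raw + THP_SIZE - 1) & ~(THP_SIZE - 1));

          madvise(p, THP_SIZE, MADV_HUGEPAGE);
          memset(p, 1, THP_SIZE);                 /* populate the THP */

          pid_t pid = fork();
          if (pid == 0) {                         /* child: unmap it again immediately */
              munmap(p, THP_SIZE);
              _exit(0);
          }

          for (size_t off = 0; off < THP_SIZE; off += 4096)
              p[off] = 2;                         /* parent: write concurrently (COW) */

          waitpid(pid, NULL, 0);
          munmap(raw, len);
      }
      return 0;
  }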
Link: https://lkml.kernel.org/r/20250415095007.569836-1-david@redhat.com Fixes: 1da190f4d0a6 ("mm: Copy-on-Write (COW) reuse support for PTE-mapped THP") Signed-off-by: David Hildenbrand Reported-by: syzbot+5e8feb543ca8e12e0ede@syzkaller.appspotmail.com Closes: https://lkml.kernel.org/r/67fab4fe.050a0220.2c5fcf.0011.GAE@google.com Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 44481fe7c629..ba3ea0a82f7f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3734,8 +3734,6 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, return false; VM_WARN_ON_ONCE(folio_test_ksm(folio)); - VM_WARN_ON_ONCE(folio_mapcount(folio) > folio_nr_pages(folio)); - VM_WARN_ON_ONCE(folio_entire_mapcount(folio)); if (unlikely(folio_test_swapcache(folio))) { /* @@ -3760,6 +3758,8 @@ static bool __wp_can_reuse_large_anon_folio(struct folio *folio, if (folio_large_mapcount(folio) != folio_ref_count(folio)) goto unlock; + VM_WARN_ON_ONCE_FOLIO(folio_large_mapcount(folio) > folio_nr_pages(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_entire_mapcount(folio), folio); VM_WARN_ON_ONCE(folio_mm_id(folio, 0) != vma->vm_mm->mm_id && folio_mm_id(folio, 1) != vma->vm_mm->mm_id); -- cgit v1.2.3 From 2db93a896fec7109302598cf45de3831340d9f53 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 16 Apr 2025 14:53:01 +0100 Subject: MAINTAINERS: add Pedro as reviewer to the MEMORY MAPPING section Pedro has offered to review memory mapping code. He has good experience in this area and has provided excellent feedback on memory mapping series in the past so I feel he'll be a great addition. Link: https://lkml.kernel.org/r/20250416135301.43513-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Acked-by: Pedro Falcato Acked-by: Liam R. Howlett Cc: Jann Horn Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index aaeb4d7113c3..40e1999fd21b 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15558,6 +15558,7 @@ M: Liam R. Howlett M: Lorenzo Stoakes R: Vlastimil Babka R: Jann Horn +R: Pedro Falcato L: linux-mm@kvack.org S: Maintained W: http://www.linux-mm.org -- cgit v1.2.3 From 38448181459e24257b40d5258afdbaa3565e8cfc Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Apr 2025 09:45:39 -0400 Subject: mm: vmscan: restore high-cpu watermark safety in kswapd Vlastimil points out that commit a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") switched kswapd from zone_watermark_ok_safe() to the standard, percpu-cached version of reading free pages, thus dropping the watermark safety precautions for systems with high CPU counts (e.g. >212 cpus on 64G). Restore them. Since zone_watermark_ok_safe() is no longer the right interface, and this was the last caller of the function anyway, open-code the zone_page_state_snapshot() conditional and delete the function. 
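As a rough back-of-the-envelope illustration of why high CPU counts matter here: each per-CPU vmstat delta can lag the global counter by up to the per-cpu threshold (at most 125 pages, as the comment in the hunk below notes), so with 212 CPUs the cached NR_FREE_PAGES reading can be off by roughly 212 * 125 = 26,500 pages — on the order of 100 MiB with 4 KiB pages — which is enough to blur the watermark checks on a 64G machine unless the exact snapshot is taken near the drift mark.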
Link: https://lkml.kernel.org/r/20250416135142.778933-2-hannes@cmpxchg.org Fixes: a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") Signed-off-by: Johannes Weiner Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 2 -- mm/page_alloc.c | 12 ------------ mm/vmscan.c | 21 +++++++++++++++++++-- 3 files changed, 19 insertions(+), 16 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 4c95fcc9e9df..6ccec1bf2896 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1502,8 +1502,6 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int highest_zoneidx, unsigned int alloc_flags); -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx); /* * Memory initialization context, use to differentiate memory added by * the platform statically or via memory hotplug interface. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e506e365d6f1..5669baf2a6fe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3470,18 +3470,6 @@ static inline bool zone_watermark_fast(struct zone *z, unsigned int order, return false; } -bool zone_watermark_ok_safe(struct zone *z, unsigned int order, - unsigned long mark, int highest_zoneidx) -{ - long free_pages = zone_page_state(z, NR_FREE_PAGES); - - if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) - free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); - - return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0, - free_pages); -} - #ifdef CONFIG_NUMA int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE; diff --git a/mm/vmscan.c b/mm/vmscan.c index b620d74b0f66..cc422ad830d6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6736,6 +6736,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * meet watermarks. */ for_each_managed_zone_pgdat(zone, pgdat, i, highest_zoneidx) { + enum zone_stat_item item; unsigned long free_pages; if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) @@ -6748,9 +6749,25 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) * blocks to avoid polluting allocator fallbacks. */ if (defrag_mode) - free_pages = zone_page_state(zone, NR_FREE_PAGES_BLOCKS); + item = NR_FREE_PAGES_BLOCKS; else - free_pages = zone_page_state(zone, NR_FREE_PAGES); + item = NR_FREE_PAGES; + + /* + * When there is a high number of CPUs in the system, + * the cumulative error from the vmstat per-cpu cache + * can blur the line between the watermarks. In that + * case, be safe and get an accurate snapshot. + * + * TODO: NR_FREE_PAGES_BLOCKS moves in steps of + * pageblock_nr_pages, while the vmstat pcp threshold + * is limited to 125. On many configurations that + * counter won't actually be per-cpu cached. But keep + * things simple for now; revisit when somebody cares. 
+ */ + free_pages = zone_page_state(zone, item); + if (zone->percpu_drift_mark && free_pages < zone->percpu_drift_mark) + free_pages = zone_page_state_snapshot(zone, item); if (__zone_watermark_ok(zone, order, mark, highest_zoneidx, 0, free_pages)) -- cgit v1.2.3 From a1f0220f3319057b364d871659ef7c10ab78f795 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 16 Apr 2025 09:45:40 -0400 Subject: mm: vmscan: fix kswapd exit condition in defrag_mode Vlastimil points out an issue with kswapd in defrag_mode not waking up kcompactd reliably. Background: When kswapd is woken for any higher-order request, it initially checks those high-order watermarks to decide if work is necesary. However, it cannot (efficiently) meet the contiguity goal of such a request by itself. So once it has reclaimed a compaction gap, it adjusts the request down to check for free order-0 pages, then wakes kcompactd to coalesce them into larger blocks. In defrag_mode, the initial watermark check needs to be analogously against free pageblocks. However, once kswapd drops the high-order to hand off contiguity work, it also needs to fall back to base page watermarks - otherwise it'll keep reclaiming until blocks are freed. While it appears kcompactd is woken up frequently enough to do most of the compaction work, kswapd ends up overreclaiming by quite a bit: DEFRAGMODE DEFRAGMODE-thispatch Hugealloc Time mean 79381.34 ( +0.00%) 88126.12 ( +11.02%) Hugealloc Time stddev 85852.16 ( +0.00%) 135366.75 ( +57.67%) Kbuild Real time 249.35 ( +0.00%) 226.71 ( -9.04%) Kbuild User time 1249.16 ( +0.00%) 1249.37 ( +0.02%) Kbuild System time 171.76 ( +0.00%) 166.93 ( -2.79%) THP fault alloc 51666.87 ( +0.00%) 52685.60 ( +1.97%) THP fault fallback 16970.00 ( +0.00%) 15951.87 ( -6.00%) Direct compact fail 166.53 ( +0.00%) 178.93 ( +7.40%) Direct compact success 17.13 ( +0.00%) 4.13 ( -71.69%) Compact daemon scanned migrate 3095413.33 ( +0.00%) 9231239.53 ( +198.22%) Compact daemon scanned free 2155966.53 ( +0.00%) 7053692.87 ( +227.17%) Compact direct scanned migrate 265642.47 ( +0.00%) 68388.33 ( -74.26%) Compact direct scanned free 130252.60 ( +0.00%) 55634.87 ( -57.29%) Compact total migrate scanned 3361055.80 ( +0.00%) 9299627.87 ( +176.69%) Compact total free scanned 2286219.13 ( +0.00%) 7109327.73 ( +210.96%) Alloc stall 1890.80 ( +0.00%) 6297.60 ( +232.94%) Pages kswapd scanned 9043558.80 ( +0.00%) 5952576.73 ( -34.18%) Pages kswapd reclaimed 1891708.67 ( +0.00%) 1030645.00 ( -45.52%) Pages direct scanned 1017090.60 ( +0.00%) 2688047.60 ( +164.29%) Pages direct reclaimed 92682.60 ( +0.00%) 309770.53 ( +234.22%) Pages total scanned 10060649.40 ( +0.00%) 8640624.33 ( -14.11%) Pages total reclaimed 1984391.27 ( +0.00%) 1340415.53 ( -32.45%) Swap out 884585.73 ( +0.00%) 417781.93 ( -52.77%) Swap in 287106.27 ( +0.00%) 95589.73 ( -66.71%) File refaults 551697.60 ( +0.00%) 426474.80 ( -22.70%) Link: https://lkml.kernel.org/r/20250416135142.778933-3-hannes@cmpxchg.org Fixes: a211c6550efc ("mm: page_alloc: defrag_mode kswapd/kcompactd watermarks") Signed-off-by: Johannes Weiner Reported-by: Vlastimil Babka Reviewed-by: Vlastimil Babka Cc: Brendan Jackman Signed-off-by: Andrew Morton --- mm/vmscan.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index cc422ad830d6..3783e45bfc92 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -6747,8 +6747,14 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx) /* * In defrag_mode, watermarks must be met in whole * blocks to 
avoid polluting allocator fallbacks. + * + * However, kswapd usually cannot accomplish this on + * its own and needs kcompactd support. Once it's + * reclaimed a compaction gap, and kswapd_shrink_node + * has dropped order, simply ensure there are enough + * base pages for compaction, wake kcompactd & sleep. */ - if (defrag_mode) + if (defrag_mode && order) item = NR_FREE_PAGES_BLOCKS; else item = NR_FREE_PAGES; -- cgit v1.2.3 From ea21641b6a79f9cdd64f8339983c71c89949dcb5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 16 Apr 2025 11:38:37 +0100 Subject: MAINTAINERS: add section for locking of mm's and VMAs We place this under memory mapping as related to memory mapping abstractions in the form of mm_struct and vm_area_struct (VMA). Now we have separated out mmap/vma locking logic into the mmap_lock.c and mmap_lock.h files, so this should encapsulate the majority of the mm locking logic in the kernel. Suren is best placed to maintain this logic as the core architect of VMA locking as a whole. Link: https://lkml.kernel.org/r/e6ed679a184ca444b20dfa77af96913fd8b5efa0.1744799282.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Suren Baghdasaryan Reviewed-by: Shakeel Butt Acked-by: David Hildenbrand Reviewed-by: Liam R. Howlett Acked-by: Vlastimil Babka Cc: Matthew Wilcox (Oracle) Cc: "Paul E . McKenney" Cc: SeongJae Park Signed-off-by: Andrew Morton --- MAINTAINERS | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 40e1999fd21b..c70f16ed6208 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15574,6 +15574,22 @@ F: mm/vma.h F: mm/vma_internal.h F: tools/testing/vma/ +MEMORY MAPPING - LOCKING +M: Andrew Morton +M: Suren Baghdasaryan +M: Liam R. Howlett +M: Lorenzo Stoakes +R: Vlastimil Babka +R: Shakeel Butt +L: linux-mm@kvack.org +S: Maintained +W: http://www.linux-mm.org +T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm +F: Documentation/mm/process_addrs.rst +F: include/linux/mmap_lock.h +F: include/trace/events/mmap_lock.h +F: mm/mmap_lock.c + MEMORY MAPPING - MADVISE (MEMORY ADVICE) M: Andrew Morton M: Liam R. Howlett -- cgit v1.2.3 From a34d74877c66ce484ad586d806002ceaedd58657 Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 3 Apr 2025 12:31:37 +0300 Subject: PCI: Restore assigned resources fully after release MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PCI resource fitting code in __assign_resources_sorted() runs in multiple steps. A resource that was successfully assigned may have to be released before the next step attempts assignment again. The assign+release cycle is destructive to a start-aligned struct resource (bridge window or IOV resource) because the start field is overwritten with the real address when the resource got assigned. One symptom: pci 0002:00:00.0: bridge window [mem size 0x00100000]: can't assign; bogus alignment Properly restore the resource after releasing it. The start, end, and flags fields must be stored into the related struct pci_dev_resource in order to be able to restore the resource to its original state. 
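A minimal sketch of the restore step described above; the helper name and the saved start/end/flags fields come from the patch below, but the body shown here is illustrative rather than the exact upstream implementation:

/* Illustrative only: put a released resource back to the values that
 * were saved in struct pci_dev_resource when it was queued for
 * assignment. */
static void restore_dev_resource(struct pci_dev_resource *dev_res)
{
	struct resource *res = dev_res->res;

	res->start = dev_res->start;
	res->end = dev_res->end;
	res->flags = dev_res->flags;
}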
Fixes: 96336ec70264 ("PCI: Perform reset_resource() and build fail list in sync") Reported-by: Guenter Roeck Closes: https://lore.kernel.org/r/01eb7d40-f5b5-4ec5-b390-a5c042c30aff@roeck-us.net/ Reported-by: Nicolas Frattaroli Closes: https://lore.kernel.org/r/3578030.5fSG56mABF@workhorse Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Nicolas Frattaroli Tested-by: Guenter Roeck Tested-by: Ondrej Jirman Link: https://patch.msgid.link/20250403093137.1481-1-ilpo.jarvinen@linux.intel.com --- drivers/pci/setup-bus.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 54d6f4fa3ce1..e994c546422c 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -187,6 +187,9 @@ static void pdev_sort_resources(struct pci_dev *dev, struct list_head *head) panic("%s: kzalloc() failed!\n", __func__); tmp->res = r; tmp->dev = dev; + tmp->start = r->start; + tmp->end = r->end; + tmp->flags = r->flags; /* Fallback is smallest one or list is empty */ n = head; @@ -545,6 +548,7 @@ assign: pci_dbg(dev, "%s %pR: releasing\n", res_name, res); release_resource(res); + restore_dev_resource(dev_res); } /* Restore start/end/flags from saved list */ list_for_each_entry(save_res, &save_head, list) -- cgit v1.2.3 From 39e703ed3b48c4262be141072d4f42a8b89a10cc Mon Sep 17 00:00:00 2001 From: Ilpo Järvinen Date: Thu, 17 Apr 2025 15:45:29 +0300 Subject: selftests/pcie_bwctrl: Fix test progs list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit df6f8c4d72ae ("selftests/pcie_bwctrl: Add 'set_pcie_speed.sh' to TEST_PROGS") added set_pcie_speed.sh into TEST_PROGS but that script is a helper that is only being called by set_pcie_cooling_state.sh, not a test case itself. When set_pcie_speed.sh is in TEST_PROGS, selftest harness will execute also it leading to bwctrl selftest errors: # selftests: pcie_bwctrl: set_pcie_speed.sh # cat: /cur_state: No such file or directory not ok 2 selftests: pcie_bwctrl: set_pcie_speed.sh # exit=1 Place set_pcie_speed.sh into TEST_FILES instead to have it included into installed test files but not execute it from the test harness. Fixes: df6f8c4d72ae ("selftests/pcie_bwctrl: Add 'set_pcie_speed.sh' to TEST_PROGS") Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20250417124529.11391-1-ilpo.jarvinen@linux.intel.com --- tools/testing/selftests/pcie_bwctrl/Makefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/pcie_bwctrl/Makefile b/tools/testing/selftests/pcie_bwctrl/Makefile index 48ec048f47af..277f92f9d753 100644 --- a/tools/testing/selftests/pcie_bwctrl/Makefile +++ b/tools/testing/selftests/pcie_bwctrl/Makefile @@ -1,2 +1,3 @@ -TEST_PROGS = set_pcie_cooling_state.sh set_pcie_speed.sh +TEST_PROGS = set_pcie_cooling_state.sh +TEST_FILES = set_pcie_speed.sh include ../lib.mk -- cgit v1.2.3 From 183a08715af1491d381b4e22efd61578fbe05fa5 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Thu, 10 Apr 2025 03:16:26 -0400 Subject: virtgpu: don't reset on shutdown It looks like GPUs are used after shutdown is invoked. 
Thus, breaking virtio gpu in the shutdown callback is not a good idea - guest hangs attempting to finish console drawing, with these warnings: [ 20.504464] WARNING: CPU: 0 PID: 568 at drivers/gpu/drm/virtio/virtgpu_vq.c:358 virtio_gpu_queue_ctrl_sgs+0x236/0x290 [virtio_gpu] [ 20.505685] Modules linked in: nft_fib_inet nft_fib_ipv4 nft_fib_ipv6 nft_fib nft_reject_inet nf_reject_ipv4 nf_reject_ipv6 nft_reject nft_ct nft_chain_nat nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 rfkill ip_set nf_tables nfnetlink vfat fat intel_rapl_msr intel_rapl_common intel_uncore_frequency_common nfit libnvdimm kvm_intel kvm rapl iTCO_wdt iTCO_vendor_support virtio_gpu virtio_dma_buf pcspkr drm_shmem_helper i2c_i801 drm_kms_helper lpc_ich i2c_smbus virtio_balloon joydev drm fuse xfs libcrc32c ahci libahci crct10dif_pclmul crc32_pclmul crc32c_intel libata virtio_net ghash_clmulni_intel net_failover virtio_blk failover serio_raw dm_mirror dm_region_hash dm_log dm_mod [ 20.511847] CPU: 0 PID: 568 Comm: kworker/0:3 Kdump: loaded Tainted: G W ------- --- 5.14.0-578.6675_1757216455.el9.x86_64 #1 [ 20.513157] Hardware name: Red Hat KVM/RHEL, BIOS edk2-20241117-3.el9 11/17/2024 [ 20.513918] Workqueue: events drm_fb_helper_damage_work [drm_kms_helper] [ 20.514626] RIP: 0010:virtio_gpu_queue_ctrl_sgs+0x236/0x290 [virtio_gpu] [ 20.515332] Code: 00 00 48 85 c0 74 0c 48 8b 78 08 48 89 ee e8 51 50 00 00 65 ff 0d 42 e3 74 3f 0f 85 69 ff ff ff 0f 1f 44 00 00 e9 5f ff ff ff <0f> 0b e9 3f ff ff ff 48 83 3c 24 00 74 0e 49 8b 7f 40 48 85 ff 74 [ 20.517272] RSP: 0018:ff34f0a8c0787ad8 EFLAGS: 00010282 [ 20.517820] RAX: 00000000fffffffb RBX: 0000000000000000 RCX: 0000000000000820 [ 20.518565] RDX: 0000000000000000 RSI: ff34f0a8c0787be0 RDI: ff218bef03a26300 [ 20.519308] RBP: ff218bef03a26300 R08: 0000000000000001 R09: ff218bef07224360 [ 20.520059] R10: 0000000000008dc0 R11: 0000000000000002 R12: ff218bef02630028 [ 20.520806] R13: ff218bef0263fb48 R14: ff218bef00cb8000 R15: ff218bef07224360 [ 20.521555] FS: 0000000000000000(0000) GS:ff218bef7ba00000(0000) knlGS:0000000000000000 [ 20.522397] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 20.522996] CR2: 000055ac4f7871c0 CR3: 000000010b9f2002 CR4: 0000000000771ef0 [ 20.523740] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 20.524477] DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 [ 20.525223] PKRU: 55555554 [ 20.525515] Call Trace: [ 20.525777] [ 20.526003] ? show_trace_log_lvl+0x1c4/0x2df [ 20.526464] ? show_trace_log_lvl+0x1c4/0x2df [ 20.526925] ? virtio_gpu_queue_fenced_ctrl_buffer+0x82/0x2c0 [virtio_gpu] [ 20.527643] ? virtio_gpu_queue_ctrl_sgs+0x236/0x290 [virtio_gpu] [ 20.528282] ? __warn+0x7e/0xd0 [ 20.528621] ? virtio_gpu_queue_ctrl_sgs+0x236/0x290 [virtio_gpu] [ 20.529256] ? report_bug+0x100/0x140 [ 20.529643] ? handle_bug+0x3c/0x70 [ 20.530010] ? exc_invalid_op+0x14/0x70 [ 20.530421] ? asm_exc_invalid_op+0x16/0x20 [ 20.530862] ? virtio_gpu_queue_ctrl_sgs+0x236/0x290 [virtio_gpu] [ 20.531506] ? virtio_gpu_queue_ctrl_sgs+0x174/0x290 [virtio_gpu] [ 20.532148] virtio_gpu_queue_fenced_ctrl_buffer+0x82/0x2c0 [virtio_gpu] [ 20.532843] virtio_gpu_primary_plane_update+0x3e2/0x460 [virtio_gpu] [ 20.533520] drm_atomic_helper_commit_planes+0x108/0x320 [drm_kms_helper] [ 20.534233] drm_atomic_helper_commit_tail+0x45/0x80 [drm_kms_helper] [ 20.534914] commit_tail+0xd2/0x130 [drm_kms_helper] [ 20.535446] drm_atomic_helper_commit+0x11b/0x140 [drm_kms_helper] [ 20.536097] drm_atomic_commit+0xa4/0xe0 [drm] [ 20.536588] ? 
__pfx___drm_printfn_info+0x10/0x10 [drm] [ 20.537162] drm_atomic_helper_dirtyfb+0x192/0x270 [drm_kms_helper] [ 20.537823] drm_fbdev_shmem_helper_fb_dirty+0x43/0xa0 [drm_shmem_helper] [ 20.538536] drm_fb_helper_damage_work+0x87/0x160 [drm_kms_helper] [ 20.539188] process_one_work+0x194/0x380 [ 20.539612] worker_thread+0x2fe/0x410 [ 20.540007] ? __pfx_worker_thread+0x10/0x10 [ 20.540456] kthread+0xdd/0x100 [ 20.540791] ? __pfx_kthread+0x10/0x10 [ 20.541190] ret_from_fork+0x29/0x50 [ 20.541566] [ 20.541802] ---[ end trace 0000000000000000 ]--- It looks like the shutdown is called in the middle of console drawing, so we should either wait for it to finish, or let drm handle the shutdown. This patch implements this second option: Add an option for drivers to bypass the common break+reset handling. As DRM is careful to flush/synchronize outstanding buffers, it looks like GPU can just have a NOP there. Reviewed-by: Eric Auger Tested-by: Eric Auger Fixes: 8bd2fa086a04 ("virtio: break and reset virtio devices on device_shutdown()") Cc: Eric Auger Cc: Jocelyn Falempe Signed-off-by: Michael S. Tsirkin Message-Id: <8490dbeb6f79ed039e6c11d121002618972538a3.1744293540.git.mst@redhat.com> --- drivers/gpu/drm/virtio/virtgpu_drv.c | 9 +++++++++ drivers/virtio/virtio.c | 6 ++++++ include/linux/virtio.h | 3 +++ 3 files changed, 18 insertions(+) diff --git a/drivers/gpu/drm/virtio/virtgpu_drv.c b/drivers/gpu/drm/virtio/virtgpu_drv.c index 2d88e390feb4..e32e680c7197 100644 --- a/drivers/gpu/drm/virtio/virtgpu_drv.c +++ b/drivers/gpu/drm/virtio/virtgpu_drv.c @@ -128,6 +128,14 @@ static void virtio_gpu_remove(struct virtio_device *vdev) drm_dev_put(dev); } +static void virtio_gpu_shutdown(struct virtio_device *vdev) +{ + /* + * drm does its own synchronization on shutdown. + * Do nothing here, opt out of device reset. + */ +} + static void virtio_gpu_config_changed(struct virtio_device *vdev) { struct drm_device *dev = vdev->priv; @@ -162,6 +170,7 @@ static struct virtio_driver virtio_gpu_driver = { .id_table = id_table, .probe = virtio_gpu_probe, .remove = virtio_gpu_remove, + .shutdown = virtio_gpu_shutdown, .config_changed = virtio_gpu_config_changed }; diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c index 150753c3b578..95d5d7993e5b 100644 --- a/drivers/virtio/virtio.c +++ b/drivers/virtio/virtio.c @@ -407,6 +407,12 @@ static void virtio_dev_shutdown(struct device *_d) if (!drv) return; + /* If the driver has its own shutdown method, use that. */ + if (drv->shutdown) { + drv->shutdown(dev); + return; + } + /* * Some devices get wedged if you kick them after they are * reset. Mark all vqs as broken to make sure we don't. diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 4d16c13d0df5..64cb4b04be7a 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -220,6 +220,8 @@ size_t virtio_max_dma_size(const struct virtio_device *vdev); * occurs. * @reset_done: optional function to call after transport specific reset * operation has finished. + * @shutdown: synchronize with the device on shutdown. If provided, replaces + * the virtio core implementation. 
*/ struct virtio_driver { struct device_driver driver; @@ -237,6 +239,7 @@ struct virtio_driver { int (*restore)(struct virtio_device *dev); int (*reset_prepare)(struct virtio_device *dev); int (*reset_done)(struct virtio_device *dev); + void (*shutdown)(struct virtio_device *dev); }; #define drv_to_virtio(__drv) container_of_const(__drv, struct virtio_driver, driver) -- cgit v1.2.3 From fbd3039a64b01b769040677c4fc68badeca8e3b2 Mon Sep 17 00:00:00 2001 From: Halil Pasic Date: Sat, 22 Mar 2025 01:29:54 +0100 Subject: virtio_console: fix missing byte order handling for cols and rows As per virtio spec the fields cols and rows are specified as little endian. Although there is no legacy interface requirement that would state that cols and rows need to be handled as native endian when legacy interface is used, unlike for the fields of the adjacent struct virtio_console_control, I decided to err on the side of caution based on some non-conclusive virtio spec repo archaeology and opt for using virtio16_to_cpu() much like for virtio_console_control.event. Strictly by the letter of the spec virtio_le_to_cpu() would have been sufficient. But when the legacy interface is not used, it boils down to the same. And when using the legacy interface, the device formatting these as little endian when the guest is big endian would surprise me more than it using guest native byte order (which would make it compatible with the current implementation). Nevertheless somebody trying to implement the spec following it to the letter could end up forcing little endian byte order when the legacy interface is in use. So IMHO this ultimately needs a judgement call by the maintainers. Fixes: 8345adbf96fc1 ("virtio: console: Accept console size along with resize control message") Signed-off-by: Halil Pasic Cc: stable@vger.kernel.org # v2.6.35+ Message-Id: <20250322002954.3129282-1-pasic@linux.ibm.com> Signed-off-by: Michael S. Tsirkin --- drivers/char/virtio_console.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 5f04951d0dd4..216c5115637d 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1576,8 +1576,8 @@ static void handle_control_message(struct virtio_device *vdev, break; case VIRTIO_CONSOLE_RESIZE: { struct { - __u16 rows; - __u16 cols; + __virtio16 rows; + __virtio16 cols; } size; if (!is_console_port(port)) @@ -1585,7 +1585,8 @@ static void handle_control_message(struct virtio_device *vdev, memcpy(&size, buf->buf + buf->offset + sizeof(*cpkt), sizeof(size)); - set_console_size(port, size.rows, size.cols); + set_console_size(port, virtio16_to_cpu(vdev, size.rows), + virtio16_to_cpu(vdev, size.cols)); port->cons.hvc->irq_requested = 1; resize_console(port); -- cgit v1.2.3 From 5326ab737a47278dbd16ed3ee7380b26c7056ddd Mon Sep 17 00:00:00 2001 From: Maximilian Immanuel Brandtner Date: Mon, 24 Mar 2025 15:42:46 +0100 Subject: virtio_console: fix order of fields cols and rows According to section 5.3.6.2 (Multiport Device Operation) of the virtio spec(version 1.2) a control buffer with the event VIRTIO_CONSOLE_RESIZE is followed by a virtio_console_resize struct containing cols then rows. The kernel implements this the wrong way around (rows then cols) resulting in the two values being swapped. Signed-off-by: Maximilian Immanuel Brandtner Message-Id: <20250324144300.905535-1-maxbr@linux.ibm.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/char/virtio_console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 216c5115637d..088182e54deb 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1576,8 +1576,8 @@ static void handle_control_message(struct virtio_device *vdev, break; case VIRTIO_CONSOLE_RESIZE: { struct { - __virtio16 rows; __virtio16 cols; + __virtio16 rows; } size; if (!is_console_port(port)) -- cgit v1.2.3 From fec0abf52609c20279243699d08b660c142ce0aa Mon Sep 17 00:00:00 2001 From: Stefano Garzarella Date: Thu, 27 Mar 2025 13:44:35 +0100 Subject: vhost_task: fix vhost_task_create() documentation Commit cb380909ae3b ("vhost: return task creation error instead of NULL") changed the return value of vhost_task_create(), but did not update the documentation. Reflect the change in the documentation: on an error, vhost_task_create() returns an ERR_PTR() and no longer NULL. Signed-off-by: Stefano Garzarella Message-Id: <20250327124435.142831-1-sgarzare@redhat.com> Signed-off-by: Michael S. Tsirkin --- kernel/vhost_task.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/vhost_task.c b/kernel/vhost_task.c index 2ef2e1b80091..2f844c279a3e 100644 --- a/kernel/vhost_task.c +++ b/kernel/vhost_task.c @@ -111,7 +111,7 @@ EXPORT_SYMBOL_GPL(vhost_task_stop); * @arg: data to be passed to fn and handled_kill * @name: the thread's name * - * This returns a specialized task for use by the vhost layer or NULL on + * This returns a specialized task for use by the vhost layer or ERR_PTR() on * failure. The returned task is inactive, and the caller must fire it up * through vhost_task_start(). */ -- cgit v1.2.3 From f591cf9fce724e5075cc67488c43c6e39e8cbe27 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 2 Apr 2025 23:29:46 -0700 Subject: vhost-scsi: protect vq->log_used with vq->mutex The vhost-scsi completion path may access vq->log_base when vq->log_used is already set to false. vhost-thread QEMU-thread vhost_scsi_complete_cmd_work() -> vhost_add_used() -> vhost_add_used_n() if (unlikely(vq->log_used)) QEMU disables vq->log_used via VHOST_SET_VRING_ADDR. mutex_lock(&vq->mutex); vq->log_used = false now! mutex_unlock(&vq->mutex); QEMU gfree(vq->log_base) log_used() -> log_write(vq->log_base) Assuming the VMM is QEMU. The vq->log_base is from QEMU userpace and can be reclaimed via gfree(). As a result, this causes invalid memory writes to QEMU userspace. The control queue path has the same issue. Signed-off-by: Dongli Zhang Acked-by: Jason Wang Reviewed-by: Mike Christie Message-Id: <20250403063028.16045-2-dongli.zhang@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/scsi.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index f6f5a7ac7894..f846f2aa7c87 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -627,6 +627,9 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) int ret; llnode = llist_del_all(&svq->completion_list); + + mutex_lock(&svq->vq.mutex); + llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) { se_cmd = &cmd->tvc_se_cmd; @@ -660,6 +663,8 @@ static void vhost_scsi_complete_cmd_work(struct vhost_work *work) vhost_scsi_release_cmd_res(se_cmd); } + mutex_unlock(&svq->vq.mutex); + if (signal) vhost_signal(&svq->vs->dev, &svq->vq); } @@ -1432,8 +1437,11 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work *work) else resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED; + mutex_lock(&tmf->svq->vq.mutex); vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs, tmf->vq_desc, &tmf->resp_iov, resp_code); + mutex_unlock(&tmf->svq->vq.mutex); + vhost_scsi_release_tmf_res(tmf); } -- cgit v1.2.3 From b182687135474d7ed905a07cc6cb2734b359e13e Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 2 Apr 2025 23:29:47 -0700 Subject: vhost-scsi: Fix vhost_scsi_send_bad_target() Although the support of VIRTIO_F_ANY_LAYOUT + VIRTIO_F_VERSION_1 was signaled by the commit 664ed90e621c ("vhost/scsi: Set VIRTIO_F_ANY_LAYOUT + VIRTIO_F_VERSION_1 feature bits"), vhost_scsi_send_bad_target() still assumes the response in a single descriptor. In addition, although vhost_scsi_send_bad_target() is used by both I/O queue and control queue, the response header is always virtio_scsi_cmd_resp. It is required to use virtio_scsi_ctrl_tmf_resp or virtio_scsi_ctrl_an_resp for control queue. Fixes: 664ed90e621c ("vhost/scsi: Set VIRTIO_F_ANY_LAYOUT + VIRTIO_F_VERSION_1 feature bits") Signed-off-by: Dongli Zhang Acked-by: Jason Wang Reviewed-by: Mike Christie Message-Id: <20250403063028.16045-3-dongli.zhang@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/scsi.c | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index f846f2aa7c87..59d907b94c5e 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -1015,23 +1015,46 @@ vhost_scsi_send_status(struct vhost_scsi *vs, struct vhost_virtqueue *vq, pr_err("Faulted on virtio_scsi_cmd_resp\n"); } +#define TYPE_IO_CMD 0 +#define TYPE_CTRL_TMF 1 +#define TYPE_CTRL_AN 2 + static void vhost_scsi_send_bad_target(struct vhost_scsi *vs, struct vhost_virtqueue *vq, - int head, unsigned out) + struct vhost_scsi_ctx *vc, int type) { - struct virtio_scsi_cmd_resp __user *resp; - struct virtio_scsi_cmd_resp rsp; + union { + struct virtio_scsi_cmd_resp cmd; + struct virtio_scsi_ctrl_tmf_resp tmf; + struct virtio_scsi_ctrl_an_resp an; + } rsp; + struct iov_iter iov_iter; + size_t rsp_size; int ret; memset(&rsp, 0, sizeof(rsp)); - rsp.response = VIRTIO_SCSI_S_BAD_TARGET; - resp = vq->iov[out].iov_base; - ret = __copy_to_user(resp, &rsp, sizeof(rsp)); - if (!ret) - vhost_add_used_and_signal(&vs->dev, vq, head, 0); + + if (type == TYPE_IO_CMD) { + rsp_size = sizeof(struct virtio_scsi_cmd_resp); + rsp.cmd.response = VIRTIO_SCSI_S_BAD_TARGET; + } else if (type == TYPE_CTRL_TMF) { + rsp_size = sizeof(struct virtio_scsi_ctrl_tmf_resp); + rsp.tmf.response = VIRTIO_SCSI_S_BAD_TARGET; + } else { + rsp_size = sizeof(struct virtio_scsi_ctrl_an_resp); + rsp.an.response = VIRTIO_SCSI_S_BAD_TARGET; + } + + iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[vc->out], vc->in, + rsp_size); + + ret = copy_to_iter(&rsp, rsp_size, &iov_iter); + + if (likely(ret == rsp_size)) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); else - pr_err("Faulted on virtio_scsi_cmd_resp\n"); + pr_err("Faulted on virtio scsi type=%d\n", type); } static int @@ -1395,7 +1418,7 @@ err: if (ret == -ENXIO) break; else if (ret == -EIO) - vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + vhost_scsi_send_bad_target(vs, vq, &vc, TYPE_IO_CMD); else if (ret == -ENOMEM) vhost_scsi_send_status(vs, vq, vc.head, vc.out, SAM_STAT_TASK_SET_FULL); @@ -1631,7 +1654,10 @@ err: if (ret == -ENXIO) break; else if (ret == -EIO) - vhost_scsi_send_bad_target(vs, vq, vc.head, vc.out); + vhost_scsi_send_bad_target(vs, vq, &vc, + v_req.type == VIRTIO_SCSI_T_TMF ? + TYPE_CTRL_TMF : + TYPE_CTRL_AN); } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); out: mutex_unlock(&vq->mutex); -- cgit v1.2.3 From 58465d86071b61415e25fb054201f61e83d21465 Mon Sep 17 00:00:00 2001 From: Dongli Zhang Date: Wed, 2 Apr 2025 23:29:48 -0700 Subject: vhost-scsi: Fix vhost_scsi_send_status() Although the support of VIRTIO_F_ANY_LAYOUT + VIRTIO_F_VERSION_1 was signaled by the commit 664ed90e621c ("vhost/scsi: Set VIRTIO_F_ANY_LAYOUT + VIRTIO_F_VERSION_1 feature bits"), vhost_scsi_send_bad_target() still assumes the response in a single descriptor. Similar issue in vhost_scsi_send_bad_target() has been fixed in previous commit. In addition, similar issue for vhost_scsi_complete_cmd_work() has been fixed by the commit 6dd88fd59da8 ("vhost-scsi: unbreak any layout for response"). Fixes: 3ca51662f818 ("vhost-scsi: Add better resource allocation failure handling") Signed-off-by: Dongli Zhang Acked-by: Jason Wang Reviewed-by: Mike Christie Message-Id: <20250403063028.16045-4-dongli.zhang@oracle.com> Signed-off-by: Michael S. 
Tsirkin --- drivers/vhost/scsi.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c index 59d907b94c5e..26bcf3a7f70c 100644 --- a/drivers/vhost/scsi.c +++ b/drivers/vhost/scsi.c @@ -999,18 +999,22 @@ static void vhost_scsi_target_queue_cmd(struct vhost_scsi_nexus *nexus, static void vhost_scsi_send_status(struct vhost_scsi *vs, struct vhost_virtqueue *vq, - int head, unsigned int out, u8 status) + struct vhost_scsi_ctx *vc, u8 status) { - struct virtio_scsi_cmd_resp __user *resp; struct virtio_scsi_cmd_resp rsp; + struct iov_iter iov_iter; int ret; memset(&rsp, 0, sizeof(rsp)); rsp.status = status; - resp = vq->iov[out].iov_base; - ret = __copy_to_user(resp, &rsp, sizeof(rsp)); - if (!ret) - vhost_add_used_and_signal(&vs->dev, vq, head, 0); + + iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[vc->out], vc->in, + sizeof(rsp)); + + ret = copy_to_iter(&rsp, sizeof(rsp), &iov_iter); + + if (likely(ret == sizeof(rsp))) + vhost_add_used_and_signal(&vs->dev, vq, vc->head, 0); else pr_err("Faulted on virtio_scsi_cmd_resp\n"); } @@ -1420,7 +1424,7 @@ err: else if (ret == -EIO) vhost_scsi_send_bad_target(vs, vq, &vc, TYPE_IO_CMD); else if (ret == -ENOMEM) - vhost_scsi_send_status(vs, vq, vc.head, vc.out, + vhost_scsi_send_status(vs, vq, &vc, SAM_STAT_TASK_SET_FULL); } while (likely(!vhost_exceeds_weight(vq, ++c, 0))); out: -- cgit v1.2.3 From d481ee35247d2a01764667a25f6f512c292ba42d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 18 Apr 2025 10:12:08 -0400 Subject: tracing: selftests: Add testing a user string to filters Running the following commands was broken: # cd /sys/kernel/tracing # echo "filename.ustring ~ \"/proc*\"" > events/syscalls/sys_enter_openat/filter # echo 1 > events/syscalls/sys_enter_openat/enable # ls /proc/$$/maps # cat trace And would produce nothing when it should have produced something like: ls-1192 [007] ..... 8169.828333: sys_openat(dfd: ffffffffffffff9c, filename: 7efc18359904, flags: 80000, mode: 0) Add a test to check this case so that it will be caught if it breaks again. 
Link: https://lore.kernel.org/linux-trace-kernel/20250417183003.505835fb@gandalf.local.home/ Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Andrew Morton Cc: Shuah Khan Link: https://lore.kernel.org/20250418101208.38dc81f5@gandalf.local.home Signed-off-by: Steven Rostedt (Google) --- .../ftrace/test.d/filter/event-filter-function.tc | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc index 118247b8dd84..c62165fabd0c 100644 --- a/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc +++ b/tools/testing/selftests/ftrace/test.d/filter/event-filter-function.tc @@ -80,6 +80,26 @@ if [ $misscnt -gt 0 ]; then exit_fail fi +# Check strings too +if [ -f events/syscalls/sys_enter_openat/filter ]; then + DIRNAME=`basename $TMPDIR` + echo "filename.ustring ~ \"*$DIRNAME*\"" > events/syscalls/sys_enter_openat/filter + echo 1 > events/syscalls/sys_enter_openat/enable + echo 1 > tracing_on + ls /bin/sh + nocnt=`grep openat trace | wc -l` + ls $TMPDIR + echo 0 > tracing_on + hitcnt=`grep openat trace | wc -l`; + echo 0 > events/syscalls/sys_enter_openat/enable + if [ $nocnt -gt 0 ]; then + exit_fail + fi + if [ $hitcnt -eq 0 ]; then + exit_fail + fi +fi + reset_events_filter exit 0 -- cgit v1.2.3 From dc915672f9176799e48ac23a155f48742b15ec6c Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 17 Apr 2025 17:29:33 -0700 Subject: cxl: Fix devm host device for CXL fwctl initialization Testing revealed the following error message for a CXL memdev that has Feature support: [ 56.690430] cxl mem0: Resources present before probing Attach the allocation of cxl_fwctl to the parent device of cxl_memdev. devm_add_* calls for cxl_memdev should not happen before the memdev probe function or outside the scope of the memdev driver. cxl_test missed this bug because cxl_test always arranges for the cxl_mem driver to be loaded before cxl_mock_mem runs. So the driver core always finds the devres list idle in that case. 
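As a rough illustration of the devm lifetime rule relied on here (not the actual CXL code; example_setup() and example_teardown() are made-up names), a cleanup action registered during probe should be attached to the device whose probe is currently running:

/* Illustrative sketch: tie the cleanup action to the probing parent
 * device, not to a child device that no driver has bound to yet.
 * example_setup()/example_teardown() are hypothetical helpers. */
static int example_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
	void *data = example_setup();

	if (!data)
		return -ENOMEM;

	/* released automatically when the driver bound to pdev unbinds */
	return devm_add_action_or_reset(&pdev->dev, example_teardown, data);
}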
[DJ: Updated subject title and added commit log suggestion from djbw] Fixes: 858ce2f56b52 ("cxl: Add FWCTL support to CXL") Reviewed-by: Dan Williams Reviewed-by: Alison Schofield Link: https://lore.kernel.org/linux-cxl/6801aea053466_71fe2944c@dwillia2-xfh.jf.intel.com.notmuch/ Link: https://patch.msgid.link/20250418002933.406439-1-dave.jiang@intel.com Signed-off-by: Dave Jiang --- drivers/cxl/core/features.c | 4 ++-- drivers/cxl/pci.c | 2 +- include/cxl/features.h | 5 +++-- tools/testing/cxl/test/mem.c | 2 +- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/cxl/core/features.c b/drivers/cxl/core/features.c index f4daefe3180e..150a1776480a 100644 --- a/drivers/cxl/core/features.c +++ b/drivers/cxl/core/features.c @@ -677,7 +677,7 @@ static void free_memdev_fwctl(void *_fwctl_dev) fwctl_put(fwctl_dev); } -int devm_cxl_setup_fwctl(struct cxl_memdev *cxlmd) +int devm_cxl_setup_fwctl(struct device *host, struct cxl_memdev *cxlmd) { struct cxl_dev_state *cxlds = cxlmd->cxlds; struct cxl_features_state *cxlfs; @@ -700,7 +700,7 @@ int devm_cxl_setup_fwctl(struct cxl_memdev *cxlmd) if (rc) return rc; - return devm_add_action_or_reset(&cxlmd->dev, free_memdev_fwctl, + return devm_add_action_or_reset(host, free_memdev_fwctl, no_free_ptr(fwctl_dev)); } EXPORT_SYMBOL_NS_GPL(devm_cxl_setup_fwctl, "CXL"); diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c index 7b14a154463c..785aa2af5eaa 100644 --- a/drivers/cxl/pci.c +++ b/drivers/cxl/pci.c @@ -1018,7 +1018,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (rc) return rc; - rc = devm_cxl_setup_fwctl(cxlmd); + rc = devm_cxl_setup_fwctl(&pdev->dev, cxlmd); if (rc) dev_dbg(&pdev->dev, "No CXL FWCTL setup\n"); diff --git a/include/cxl/features.h b/include/cxl/features.h index a3bb34694c06..5f7f842765a5 100644 --- a/include/cxl/features.h +++ b/include/cxl/features.h @@ -66,7 +66,7 @@ struct cxl_memdev; #ifdef CONFIG_CXL_FEATURES inline struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds); int devm_cxl_setup_features(struct cxl_dev_state *cxlds); -int devm_cxl_setup_fwctl(struct cxl_memdev *cxlmd); +int devm_cxl_setup_fwctl(struct device *host, struct cxl_memdev *cxlmd); #else static inline struct cxl_features_state *to_cxlfs(struct cxl_dev_state *cxlds) { @@ -78,7 +78,8 @@ static inline int devm_cxl_setup_features(struct cxl_dev_state *cxlds) return -EOPNOTSUPP; } -static inline int devm_cxl_setup_fwctl(struct cxl_memdev *cxlmd) +static inline int devm_cxl_setup_fwctl(struct device *host, + struct cxl_memdev *cxlmd) { return -EOPNOTSUPP; } diff --git a/tools/testing/cxl/test/mem.c b/tools/testing/cxl/test/mem.c index f2957a3e36fe..bf9caa908f89 100644 --- a/tools/testing/cxl/test/mem.c +++ b/tools/testing/cxl/test/mem.c @@ -1780,7 +1780,7 @@ static int cxl_mock_mem_probe(struct platform_device *pdev) if (rc) return rc; - rc = devm_cxl_setup_fwctl(cxlmd); + rc = devm_cxl_setup_fwctl(&pdev->dev, cxlmd); if (rc) dev_dbg(dev, "No CXL FWCTL setup\n"); -- cgit v1.2.3 From 25174d5cd22f0977034892672a0287f7febcec1c Mon Sep 17 00:00:00 2001 From: Li Ming Date: Thu, 10 Apr 2025 10:45:21 +0800 Subject: cxl/feature: Update out_len in set feature failure case CXL subsystem supports userspace to configure features via fwctl interface, it will configure features by using Set Feature command. Whatever Set Feature succeeds or fails, CXL driver always needs to return a structure fwctl_rpc_cxl_out to caller, and returned size is updated in a out_len parameter. 
The out_len should be updated not only when the set feature succeeds, but also when the set feature fails. Fixes: eb5dfcb9e36d ("cxl: Add support to handle user feature commands for set feature") Signed-off-by: Li Ming Reviewed-by: Dave Jiang Reviewed-by: Ira Weiny Reviewed-by: Jonathan Cameron Link: https://patch.msgid.link/20250410024521.514095-1-ming.li@zohomail.com Signed-off-by: Dave Jiang --- drivers/cxl/core/features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/features.c b/drivers/cxl/core/features.c index 150a1776480a..1498e2369c37 100644 --- a/drivers/cxl/core/features.c +++ b/drivers/cxl/core/features.c @@ -528,13 +528,13 @@ static void *cxlctl_set_feature(struct cxl_features_state *cxlfs, rc = cxl_set_feature(cxl_mbox, &feat_in->uuid, feat_in->version, feat_in->feat_data, data_size, flags, offset, &return_code); + *out_len = sizeof(*rpc_out); if (rc) { rpc_out->retval = return_code; return no_free_ptr(rpc_out); } rpc_out->retval = CXL_MBOX_CMD_RC_SUCCESS; - *out_len = sizeof(*rpc_out); return no_free_ptr(rpc_out); } -- cgit v1.2.3 From 117c3b21d3c79af56750f18a54f2c468f30c8a45 Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Fri, 18 Apr 2025 10:31:29 +0100 Subject: arm64: Rework checks for broken Cavium HW in the PI code MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Calling into the MIDR checking framework from the PI code has recently become much harder, due to the new fancy "multi-MIDR" support that relies on tables being populated at boot time, but not that early that they are available to the PI code. There are additional issues with this framework, as the code really isn't position independend *at all*. This leads to some ugly breakages, as reported by Ada. It so appears that the only reason for the PI code to call into the MIDR checking code is to cope with The Most Broken ARM64 System Ever, aka Cavium ThunderX, which cannot deal with nG attributes that result of the combination of KASLR and KPTI as a consequence of Erratum 27456. Duplicate the check for the erratum in the PI code, removing the dependency on the bulk of the MIDR checking framework. This allows dropping that same check from kaslr_requires_kpti(), as the KPTI code already relies on the ARM64_WORKAROUND_CAVIUM_27456 cap. Fixes: c8c2647e69bed ("arm64: Make  _midr_in_range_list() an exported function") Reported-by: Ada Couprie Diaz Signed-off-by: Marc Zyngier Link: https://lore.kernel.org/r/3d97e45a-23cf-419b-9b6f-140b4d88de7b@arm.com Cc: Catalin Marinas Cc: Will Deacon Cc: Shameer Kolothum Cc: Oliver Upton Reviewed-by: Catalin Marinas Link: https://lore.kernel.org/r/20250418093129.1755739-1-maz@kernel.org Signed-off-by: Oliver Upton --- arch/arm64/include/asm/mmu.h | 11 ----------- arch/arm64/kernel/cpu_errata.c | 2 +- arch/arm64/kernel/image-vars.h | 4 ---- arch/arm64/kernel/pi/map_kernel.c | 25 ++++++++++++++++++++++++- 4 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/arm64/include/asm/mmu.h b/arch/arm64/include/asm/mmu.h index 30a29e88994b..6e8aa8e72601 100644 --- a/arch/arm64/include/asm/mmu.h +++ b/arch/arm64/include/asm/mmu.h @@ -94,17 +94,6 @@ static inline bool kaslr_requires_kpti(void) return false; } - /* - * Systems affected by Cavium erratum 24756 are incompatible - * with KPTI. 
- */ - if (IS_ENABLED(CONFIG_CAVIUM_ERRATUM_27456)) { - extern const struct midr_range cavium_erratum_27456_cpus[]; - - if (is_midr_in_range_list(cavium_erratum_27456_cpus)) - return false; - } - return true; } diff --git a/arch/arm64/kernel/cpu_errata.c b/arch/arm64/kernel/cpu_errata.c index b55f5f705750..6b0ad5070d3e 100644 --- a/arch/arm64/kernel/cpu_errata.c +++ b/arch/arm64/kernel/cpu_errata.c @@ -335,7 +335,7 @@ static const struct midr_range cavium_erratum_23154_cpus[] = { #endif #ifdef CONFIG_CAVIUM_ERRATUM_27456 -const struct midr_range cavium_erratum_27456_cpus[] = { +static const struct midr_range cavium_erratum_27456_cpus[] = { /* Cavium ThunderX, T88 pass 1.x - 2.1 */ MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), /* Cavium ThunderX, T81 pass 1.0 */ diff --git a/arch/arm64/kernel/image-vars.h b/arch/arm64/kernel/image-vars.h index 5e3c4b58f279..2004b4f41ade 100644 --- a/arch/arm64/kernel/image-vars.h +++ b/arch/arm64/kernel/image-vars.h @@ -47,10 +47,6 @@ PROVIDE(__pi_id_aa64smfr0_override = id_aa64smfr0_override); PROVIDE(__pi_id_aa64zfr0_override = id_aa64zfr0_override); PROVIDE(__pi_arm64_sw_feature_override = arm64_sw_feature_override); PROVIDE(__pi_arm64_use_ng_mappings = arm64_use_ng_mappings); -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -PROVIDE(__pi_cavium_erratum_27456_cpus = cavium_erratum_27456_cpus); -PROVIDE(__pi_is_midr_in_range_list = is_midr_in_range_list); -#endif PROVIDE(__pi__ctype = _ctype); PROVIDE(__pi_memstart_offset_seed = memstart_offset_seed); diff --git a/arch/arm64/kernel/pi/map_kernel.c b/arch/arm64/kernel/pi/map_kernel.c index e57b043f324b..c6650cfe706c 100644 --- a/arch/arm64/kernel/pi/map_kernel.c +++ b/arch/arm64/kernel/pi/map_kernel.c @@ -207,6 +207,29 @@ static void __init map_fdt(u64 fdt) dsb(ishst); } +/* + * PI version of the Cavium Eratum 27456 detection, which makes it + * impossible to use non-global mappings. + */ +static bool __init ng_mappings_allowed(void) +{ + static const struct midr_range cavium_erratum_27456_cpus[] __initconst = { + /* Cavium ThunderX, T88 pass 1.x - 2.1 */ + MIDR_RANGE(MIDR_THUNDERX, 0, 0, 1, 1), + /* Cavium ThunderX, T81 pass 1.0 */ + MIDR_REV(MIDR_THUNDERX_81XX, 0, 0), + {}, + }; + + for (const struct midr_range *r = cavium_erratum_27456_cpus; r->model; r++) { + if (midr_is_cpu_model_range(read_cpuid_id(), r->model, + r->rv_min, r->rv_max)) + return false; + } + + return true; +} + asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) { static char const chosen_str[] __initconst = "/chosen"; @@ -246,7 +269,7 @@ asmlinkage void __init early_map_kernel(u64 boot_status, void *fdt) u64 kaslr_seed = kaslr_early_init(fdt, chosen); if (kaslr_seed && kaslr_requires_kpti()) - arm64_use_ng_mappings = true; + arm64_use_ng_mappings = ng_mappings_allowed(); kaslr_offset |= kaslr_seed & ~(MIN_KIMG_ALIGN - 1); } -- cgit v1.2.3 From 0747c136753ef44a3b1434a235492ef54081b96e Mon Sep 17 00:00:00 2001 From: Manivannan Sadhasivam Date: Fri, 18 Apr 2025 15:19:02 +0530 Subject: MAINTAINERS: Move Manivannan Sadhasivam as PCI Native host bridge and endpoint maintainer I'm currently maintaining the PCI endpoint subsystem and reviewing the native host bridge and endpoint drivers. However, this affects my endpoint maintainership role since I cannot merge endpoint patches that depend on the controller drivers (which is more common). Moreover, the controller driver patches would also benefit from a helping hand in maintaining them. 
So I'd like to step up to maintain the native host bridge and endpoint drivers together with the endpoint subsystem. Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Link: https://patch.msgid.link/20250418094905.9983-1-manivannan.sadhasivam@linaro.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..7f8961d3e57c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18633,7 +18633,7 @@ F: drivers/pci/controller/pci-xgene-msi.c PCI NATIVE HOST BRIDGE AND ENDPOINT DRIVERS M: Lorenzo Pieralisi M: Krzysztof Wilczyński -R: Manivannan Sadhasivam +M: Manivannan Sadhasivam R: Rob Herring L: linux-pci@vger.kernel.org S: Supported -- cgit v1.2.3 From 9d78f02503227d3554d26cf8ca73276105c98f3e Mon Sep 17 00:00:00 2001 From: Rob Clark Date: Mon, 17 Mar 2025 08:00:06 -0700 Subject: drm/msm/a6xx+: Don't let IB_SIZE overflow IB_SIZE is only b0..b19. Starting with a6xx gen3, additional fields were added above the IB_SIZE. Accidentially setting them can cause badness. Fix this by properly defining the CP_INDIRECT_BUFFER packet and using the generated builder macro to ensure unintended bits are not set. v2: add missing type attribute for IB_BASE v3: fix offset attribute in xml Reported-by: Connor Abbott Fixes: a83366ef19ea ("drm/msm/a6xx: add A640/A650 to gpulist") Signed-off-by: Rob Clark Patchwork: https://patchwork.freedesktop.org/patch/643396/ --- drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 8 ++++---- drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c index 1820c167fcee..28c659c72493 100644 --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c @@ -242,10 +242,10 @@ static void a6xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) break; fallthrough; case MSM_SUBMIT_CMD_BUF: - OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3); + OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); OUT_RING(ring, lower_32_bits(submit->cmd[i].iova)); OUT_RING(ring, upper_32_bits(submit->cmd[i].iova)); - OUT_RING(ring, submit->cmd[i].size); + OUT_RING(ring, A5XX_CP_INDIRECT_BUFFER_2_IB_SIZE(submit->cmd[i].size)); ibs++; break; } @@ -377,10 +377,10 @@ static void a7xx_submit(struct msm_gpu *gpu, struct msm_gem_submit *submit) break; fallthrough; case MSM_SUBMIT_CMD_BUF: - OUT_PKT7(ring, CP_INDIRECT_BUFFER_PFE, 3); + OUT_PKT7(ring, CP_INDIRECT_BUFFER, 3); OUT_RING(ring, lower_32_bits(submit->cmd[i].iova)); OUT_RING(ring, upper_32_bits(submit->cmd[i].iova)); - OUT_RING(ring, submit->cmd[i].size); + OUT_RING(ring, A5XX_CP_INDIRECT_BUFFER_2_IB_SIZE(submit->cmd[i].size)); ibs++; break; } diff --git a/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml b/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml index 55a35182858c..5a6ae9fc3194 100644 --- a/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml +++ b/drivers/gpu/drm/msm/registers/adreno/adreno_pm4.xml @@ -2259,5 +2259,12 @@ opcode: CP_LOAD_STATE4 (30) (4 dwords) + + + + + + + -- cgit v1.2.3 From aece1cf146741761a1243746db5b72f5ece68290 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Sat, 12 Apr 2025 13:36:51 +0800 Subject: Revert "crypto: testmgr - Add multibuffer acomp testing" This reverts commit 99585c2192cb1ce212876e82ef01d1c98c7f4699. Remove the acomp multibuffer tests as they are buggy. 
Reported-by: Dmitry Antipov Signed-off-by: Herbert Xu --- crypto/testmgr.c | 147 ++++++++++++++++++++++++------------------------------- 1 file changed, 64 insertions(+), 83 deletions(-) diff --git a/crypto/testmgr.c b/crypto/testmgr.c index abd609d4c8ef..82977ea25db3 100644 --- a/crypto/testmgr.c +++ b/crypto/testmgr.c @@ -58,9 +58,6 @@ module_param(fuzz_iterations, uint, 0644); MODULE_PARM_DESC(fuzz_iterations, "number of fuzz test iterations"); #endif -/* Multibuffer is unlimited. Set arbitrary limit for testing. */ -#define MAX_MB_MSGS 16 - #ifdef CONFIG_CRYPTO_MANAGER_DISABLE_TESTS /* a perfect nop */ @@ -3329,48 +3326,27 @@ static int test_acomp(struct crypto_acomp *tfm, int ctcount, int dtcount) { const char *algo = crypto_tfm_alg_driver_name(crypto_acomp_tfm(tfm)); - struct scatterlist *src = NULL, *dst = NULL; - struct acomp_req *reqs[MAX_MB_MSGS] = {}; - char *decomp_out[MAX_MB_MSGS] = {}; - char *output[MAX_MB_MSGS] = {}; - struct crypto_wait wait; - struct acomp_req *req; - int ret = -ENOMEM; unsigned int i; + char *output, *decomp_out; + int ret; + struct scatterlist src, dst; + struct acomp_req *req; + struct crypto_wait wait; - src = kmalloc_array(MAX_MB_MSGS, sizeof(*src), GFP_KERNEL); - if (!src) - goto out; - dst = kmalloc_array(MAX_MB_MSGS, sizeof(*dst), GFP_KERNEL); - if (!dst) - goto out; - - for (i = 0; i < MAX_MB_MSGS; i++) { - reqs[i] = acomp_request_alloc(tfm); - if (!reqs[i]) - goto out; - - acomp_request_set_callback(reqs[i], - CRYPTO_TFM_REQ_MAY_SLEEP | - CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - if (i) - acomp_request_chain(reqs[i], reqs[0]); - - output[i] = kmalloc(COMP_BUF_SIZE, GFP_KERNEL); - if (!output[i]) - goto out; + output = kmalloc(COMP_BUF_SIZE, GFP_KERNEL); + if (!output) + return -ENOMEM; - decomp_out[i] = kmalloc(COMP_BUF_SIZE, GFP_KERNEL); - if (!decomp_out[i]) - goto out; + decomp_out = kmalloc(COMP_BUF_SIZE, GFP_KERNEL); + if (!decomp_out) { + kfree(output); + return -ENOMEM; } for (i = 0; i < ctcount; i++) { unsigned int dlen = COMP_BUF_SIZE; int ilen = ctemplate[i].inlen; void *input_vec; - int j; input_vec = kmemdup(ctemplate[i].input, ilen, GFP_KERNEL); if (!input_vec) { @@ -3378,61 +3354,70 @@ static int test_acomp(struct crypto_acomp *tfm, goto out; } + memset(output, 0, dlen); crypto_init_wait(&wait); - sg_init_one(src, input_vec, ilen); + sg_init_one(&src, input_vec, ilen); + sg_init_one(&dst, output, dlen); - for (j = 0; j < MAX_MB_MSGS; j++) { - sg_init_one(dst + j, output[j], dlen); - acomp_request_set_params(reqs[j], src, dst + j, ilen, dlen); + req = acomp_request_alloc(tfm); + if (!req) { + pr_err("alg: acomp: request alloc failed for %s\n", + algo); + kfree(input_vec); + ret = -ENOMEM; + goto out; } - req = reqs[0]; + acomp_request_set_params(req, &src, &dst, ilen, dlen); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + ret = crypto_wait_req(crypto_acomp_compress(req), &wait); if (ret) { pr_err("alg: acomp: compression failed on test %d for %s: ret=%d\n", i + 1, algo, -ret); kfree(input_vec); + acomp_request_free(req); goto out; } ilen = req->dlen; dlen = COMP_BUF_SIZE; + sg_init_one(&src, output, ilen); + sg_init_one(&dst, decomp_out, dlen); crypto_init_wait(&wait); - for (j = 0; j < MAX_MB_MSGS; j++) { - sg_init_one(src + j, output[j], ilen); - sg_init_one(dst + j, decomp_out[j], dlen); - acomp_request_set_params(reqs[j], src + j, dst + j, ilen, dlen); - } - - crypto_wait_req(crypto_acomp_decompress(req), &wait); - for (j = 0; j < MAX_MB_MSGS; j++) { - ret = 
reqs[j]->base.err; - if (ret) { - pr_err("alg: acomp: compression failed on test %d (%d) for %s: ret=%d\n", - i + 1, j, algo, -ret); - kfree(input_vec); - goto out; - } + acomp_request_set_params(req, &src, &dst, ilen, dlen); - if (reqs[j]->dlen != ctemplate[i].inlen) { - pr_err("alg: acomp: Compression test %d (%d) failed for %s: output len = %d\n", - i + 1, j, algo, reqs[j]->dlen); - ret = -EINVAL; - kfree(input_vec); - goto out; - } + ret = crypto_wait_req(crypto_acomp_decompress(req), &wait); + if (ret) { + pr_err("alg: acomp: compression failed on test %d for %s: ret=%d\n", + i + 1, algo, -ret); + kfree(input_vec); + acomp_request_free(req); + goto out; + } - if (memcmp(input_vec, decomp_out[j], reqs[j]->dlen)) { - pr_err("alg: acomp: Compression test %d (%d) failed for %s\n", - i + 1, j, algo); - hexdump(output[j], reqs[j]->dlen); - ret = -EINVAL; - kfree(input_vec); - goto out; - } + if (req->dlen != ctemplate[i].inlen) { + pr_err("alg: acomp: Compression test %d failed for %s: output len = %d\n", + i + 1, algo, req->dlen); + ret = -EINVAL; + kfree(input_vec); + acomp_request_free(req); + goto out; + } + + if (memcmp(input_vec, decomp_out, req->dlen)) { + pr_err("alg: acomp: Compression test %d failed for %s\n", + i + 1, algo); + hexdump(output, req->dlen); + ret = -EINVAL; + kfree(input_vec); + acomp_request_free(req); + goto out; } kfree(input_vec); + acomp_request_free(req); } for (i = 0; i < dtcount; i++) { @@ -3446,9 +3431,10 @@ static int test_acomp(struct crypto_acomp *tfm, goto out; } + memset(output, 0, dlen); crypto_init_wait(&wait); - sg_init_one(src, input_vec, ilen); - sg_init_one(dst, output[0], dlen); + sg_init_one(&src, input_vec, ilen); + sg_init_one(&dst, output, dlen); req = acomp_request_alloc(tfm); if (!req) { @@ -3459,7 +3445,7 @@ static int test_acomp(struct crypto_acomp *tfm, goto out; } - acomp_request_set_params(req, src, dst, ilen, dlen); + acomp_request_set_params(req, &src, &dst, ilen, dlen); acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, crypto_req_done, &wait); @@ -3481,10 +3467,10 @@ static int test_acomp(struct crypto_acomp *tfm, goto out; } - if (memcmp(output[0], dtemplate[i].output, req->dlen)) { + if (memcmp(output, dtemplate[i].output, req->dlen)) { pr_err("alg: acomp: Decompression test %d failed for %s\n", i + 1, algo); - hexdump(output[0], req->dlen); + hexdump(output, req->dlen); ret = -EINVAL; kfree(input_vec); acomp_request_free(req); @@ -3498,13 +3484,8 @@ static int test_acomp(struct crypto_acomp *tfm, ret = 0; out: - acomp_request_free(reqs[0]); - for (i = 0; i < MAX_MB_MSGS; i++) { - kfree(output[i]); - kfree(decomp_out[i]); - } - kfree(dst); - kfree(src); + kfree(decomp_out); + kfree(output); return ret; } -- cgit v1.2.3 From 408e4504f97c0aa510330f0a04b7ed028fdf3154 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Sat, 19 Apr 2025 22:48:59 +0200 Subject: Revert "hfs{plus}: add deprecation warning" This reverts commit ddee68c499f76ae47c011549df5be53db0057402. There's ongoing discussion about better maintenance of at least hfsplus. Rever the deprecation warning for now. 
Signed-off-by: Christian Brauner --- fs/hfs/super.c | 2 -- fs/hfsplus/super.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4413cd8feb9e..fe09c2093a93 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -404,8 +404,6 @@ static int hfs_init_fs_context(struct fs_context *fc) { struct hfs_sb_info *hsb; - pr_warn("The hfs filesystem is deprecated and scheduled to be removed from the kernel in 2025\n"); - hsb = kzalloc(sizeof(struct hfs_sb_info), GFP_KERNEL); if (!hsb) return -ENOMEM; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 58cff4b2a3b4..948b8aaee33e 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -656,8 +656,6 @@ static int hfsplus_init_fs_context(struct fs_context *fc) { struct hfsplus_sb_info *sbi; - pr_warn("The hfsplus filesystem is deprecated and scheduled to be removed from the kernel in 2025\n"); - sbi = kzalloc(sizeof(struct hfsplus_sb_info), GFP_KERNEL); if (!sbi) return -ENOMEM; -- cgit v1.2.3 From efabefb05aa1fe534ddb1839980824a763a7f1b0 Mon Sep 17 00:00:00 2001 From: Sahil Siddiq Date: Sat, 19 Apr 2025 21:18:17 +0530 Subject: openrisc: Refactor struct cpuinfo_or1k to reduce duplication The "cpuinfo_or1k" structure currently has identical data members for different cache components. Remove these fields out of struct cpuinfo_or1k and into its own struct. This reduces duplication while keeping cpuinfo_or1k extensible so more cache descriptors can be added in the future. Also add a new field "sets" to the new structure. Signed-off-by: Sahil Siddiq Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/cpuinfo.h | 16 +++++++------ arch/openrisc/kernel/setup.c | 45 ++++++++++++++++++------------------- 2 files changed, 31 insertions(+), 30 deletions(-) diff --git a/arch/openrisc/include/asm/cpuinfo.h b/arch/openrisc/include/asm/cpuinfo.h index 5e4744153d0e..82f5d4c06314 100644 --- a/arch/openrisc/include/asm/cpuinfo.h +++ b/arch/openrisc/include/asm/cpuinfo.h @@ -15,16 +15,18 @@ #ifndef __ASM_OPENRISC_CPUINFO_H #define __ASM_OPENRISC_CPUINFO_H +struct cache_desc { + u32 size; + u32 sets; + u32 block_size; + u32 ways; +}; + struct cpuinfo_or1k { u32 clock_frequency; - u32 icache_size; - u32 icache_block_size; - u32 icache_ways; - - u32 dcache_size; - u32 dcache_block_size; - u32 dcache_ways; + struct cache_desc icache; + struct cache_desc dcache; u16 coreid; }; diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index be56eaafc8b9..66207cd7bb9e 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -115,16 +115,16 @@ static void print_cpuinfo(void) if (upr & SPR_UPR_DCP) printk(KERN_INFO - "-- dcache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo->dcache_size, cpuinfo->dcache_block_size, - cpuinfo->dcache_ways); + "-- dcache: %4d bytes total, %2d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->dcache.size, cpuinfo->dcache.block_size, + cpuinfo->dcache.sets, cpuinfo->dcache.ways); else printk(KERN_INFO "-- dcache disabled\n"); if (upr & SPR_UPR_ICP) printk(KERN_INFO - "-- icache: %4d bytes total, %2d bytes/line, %d way(s)\n", - cpuinfo->icache_size, cpuinfo->icache_block_size, - cpuinfo->icache_ways); + "-- icache: %4d bytes total, %2d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->icache.size, cpuinfo->icache.block_size, + cpuinfo->icache.sets, cpuinfo->icache.ways); else printk(KERN_INFO "-- icache disabled\n"); @@ -156,7 +156,6 @@ void __init setup_cpuinfo(void) { struct device_node *cpu; unsigned long iccfgr, dccfgr; - unsigned long cache_set_size; 
int cpu_id = smp_processor_id(); struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu_id]; @@ -165,18 +164,18 @@ void __init setup_cpuinfo(void) panic("Couldn't find CPU%d in device tree...\n", cpu_id); iccfgr = mfspr(SPR_ICCFGR); - cpuinfo->icache_ways = 1 << (iccfgr & SPR_ICCFGR_NCW); - cache_set_size = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); - cpuinfo->icache_block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); - cpuinfo->icache_size = - cache_set_size * cpuinfo->icache_ways * cpuinfo->icache_block_size; + cpuinfo->icache.ways = 1 << (iccfgr & SPR_ICCFGR_NCW); + cpuinfo->icache.sets = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); + cpuinfo->icache.block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); + cpuinfo->icache.size = + cpuinfo->icache.sets * cpuinfo->icache.ways * cpuinfo->icache.block_size; dccfgr = mfspr(SPR_DCCFGR); - cpuinfo->dcache_ways = 1 << (dccfgr & SPR_DCCFGR_NCW); - cache_set_size = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); - cpuinfo->dcache_block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); - cpuinfo->dcache_size = - cache_set_size * cpuinfo->dcache_ways * cpuinfo->dcache_block_size; + cpuinfo->dcache.ways = 1 << (dccfgr & SPR_DCCFGR_NCW); + cpuinfo->dcache.sets = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); + cpuinfo->dcache.block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); + cpuinfo->dcache.size = + cpuinfo->dcache.sets * cpuinfo->dcache.ways * cpuinfo->dcache.block_size; if (of_property_read_u32(cpu, "clock-frequency", &cpuinfo->clock_frequency)) { @@ -320,14 +319,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "revision\t\t: %d\n", vr & SPR_VR_REV); } seq_printf(m, "frequency\t\t: %ld\n", loops_per_jiffy * HZ); - seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache_size); + seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache.size); seq_printf(m, "dcache block size\t: %d bytes\n", - cpuinfo->dcache_block_size); - seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache_ways); - seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache_size); + cpuinfo->dcache.block_size); + seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache.ways); + seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache.size); seq_printf(m, "icache block size\t: %d bytes\n", - cpuinfo->icache_block_size); - seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache_ways); + cpuinfo->icache.block_size); + seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache.ways); seq_printf(m, "immu\t\t\t: %d entries, %lu ways\n", 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW)); -- cgit v1.2.3 From 0c4a6e79ef522554bc509294dfe69b24ee78205d Mon Sep 17 00:00:00 2001 From: Sahil Siddiq Date: Sat, 19 Apr 2025 21:18:18 +0530 Subject: openrisc: Introduce new utility functions to flush and invalidate caches According to the OpenRISC architecture manual, the dcache and icache may not be present. When these caches are present, the invalidate and flush registers may be absent. The current implementation does not perform checks to verify their presence before utilizing cache registers, or invalidating and flushing cache blocks. Introduce new functions to detect the presence of cache components and related special-purpose registers. There are a few places where a range of addresses have to be flushed or invalidated and the implementation is duplicated. Introduce new utility functions and macros that generalize this implementation and reduce duplication. 
Signed-off-by: Sahil Siddiq Signed-off-by: Stafford Horne --- arch/openrisc/include/asm/cacheflush.h | 17 +++++++++++ arch/openrisc/include/asm/cpuinfo.h | 8 +++++ arch/openrisc/kernel/dma.c | 18 +++-------- arch/openrisc/mm/cache.c | 56 ++++++++++++++++++++++++++++------ arch/openrisc/mm/init.c | 5 +-- 5 files changed, 79 insertions(+), 25 deletions(-) diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 984c331ff5f4..0e60af486ec1 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -23,6 +23,9 @@ */ extern void local_dcache_page_flush(struct page *page); extern void local_icache_page_inv(struct page *page); +extern void local_dcache_range_flush(unsigned long start, unsigned long end); +extern void local_dcache_range_inv(unsigned long start, unsigned long end); +extern void local_icache_range_inv(unsigned long start, unsigned long end); /* * Data cache flushing always happen on the local cpu. Instruction cache @@ -38,6 +41,20 @@ extern void local_icache_page_inv(struct page *page); extern void smp_icache_page_inv(struct page *page); #endif /* CONFIG_SMP */ +/* + * Even if the actual block size is larger than L1_CACHE_BYTES, paddr + * can be incremented by L1_CACHE_BYTES. When paddr is written to the + * invalidate register, the entire cache line encompassing this address + * is invalidated. Each subsequent reference to the same cache line will + * not affect the invalidation process. + */ +#define local_dcache_block_flush(addr) \ + local_dcache_range_flush(addr, addr + L1_CACHE_BYTES) +#define local_dcache_block_inv(addr) \ + local_dcache_range_inv(addr, addr + L1_CACHE_BYTES) +#define local_icache_block_inv(addr) \ + local_icache_range_inv(addr, addr + L1_CACHE_BYTES) + /* * Synchronizes caches. Whenever a cpu writes executable code to memory, this * should be called to make sure the processor sees the newly written code. diff --git a/arch/openrisc/include/asm/cpuinfo.h b/arch/openrisc/include/asm/cpuinfo.h index 82f5d4c06314..3cfc4cf0b019 100644 --- a/arch/openrisc/include/asm/cpuinfo.h +++ b/arch/openrisc/include/asm/cpuinfo.h @@ -15,6 +15,9 @@ #ifndef __ASM_OPENRISC_CPUINFO_H #define __ASM_OPENRISC_CPUINFO_H +#include +#include + struct cache_desc { u32 size; u32 sets; @@ -34,4 +37,9 @@ struct cpuinfo_or1k { extern struct cpuinfo_or1k cpuinfo_or1k[NR_CPUS]; extern void setup_cpuinfo(void); +/* + * Check if the cache component exists. 
+ */ +extern bool cpu_cache_is_present(const unsigned int cache_type); + #endif /* __ASM_OPENRISC_CPUINFO_H */ diff --git a/arch/openrisc/kernel/dma.c b/arch/openrisc/kernel/dma.c index b3edbb33b621..3a7b5baaa450 100644 --- a/arch/openrisc/kernel/dma.c +++ b/arch/openrisc/kernel/dma.c @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -24,9 +25,6 @@ static int page_set_nocache(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - unsigned long cl; - struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; - pte_val(*pte) |= _PAGE_CI; /* @@ -36,8 +34,7 @@ page_set_nocache(pte_t *pte, unsigned long addr, flush_tlb_kernel_range(addr, addr + PAGE_SIZE); /* Flush page out of dcache */ - for (cl = __pa(addr); cl < __pa(next); cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBFR, cl); + local_dcache_range_flush(__pa(addr), __pa(next)); return 0; } @@ -98,21 +95,14 @@ void arch_dma_clear_uncached(void *cpu_addr, size_t size) void arch_sync_dma_for_device(phys_addr_t addr, size_t size, enum dma_data_direction dir) { - unsigned long cl; - struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; - switch (dir) { case DMA_TO_DEVICE: /* Flush the dcache for the requested range */ - for (cl = addr; cl < addr + size; - cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBFR, cl); + local_dcache_range_flush(addr, addr + size); break; case DMA_FROM_DEVICE: /* Invalidate the dcache for the requested range */ - for (cl = addr; cl < addr + size; - cl += cpuinfo->dcache_block_size) - mtspr(SPR_DCBIR, cl); + local_dcache_range_inv(addr, addr + size); break; default: /* diff --git a/arch/openrisc/mm/cache.c b/arch/openrisc/mm/cache.c index eb43b73f3855..0f265b8e73ec 100644 --- a/arch/openrisc/mm/cache.c +++ b/arch/openrisc/mm/cache.c @@ -14,31 +14,70 @@ #include #include #include +#include #include -static __always_inline void cache_loop(struct page *page, const unsigned int reg) +/* + * Check if the cache component exists. 
+ */ +bool cpu_cache_is_present(const unsigned int cache_type) { - unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT; - unsigned long line = paddr & ~(L1_CACHE_BYTES - 1); + unsigned long upr = mfspr(SPR_UPR); + unsigned long mask = SPR_UPR_UP | cache_type; + + return !((upr & mask) ^ mask); +} + +static __always_inline void cache_loop(unsigned long paddr, unsigned long end, + const unsigned short reg, const unsigned int cache_type) +{ + if (!cpu_cache_is_present(cache_type)) + return; - while (line < paddr + PAGE_SIZE) { - mtspr(reg, line); - line += L1_CACHE_BYTES; + while (paddr < end) { + mtspr(reg, paddr); + paddr += L1_CACHE_BYTES; } } +static __always_inline void cache_loop_page(struct page *page, const unsigned short reg, + const unsigned int cache_type) +{ + unsigned long paddr = page_to_pfn(page) << PAGE_SHIFT; + unsigned long end = paddr + PAGE_SIZE; + + paddr &= ~(L1_CACHE_BYTES - 1); + + cache_loop(paddr, end, reg, cache_type); +} + void local_dcache_page_flush(struct page *page) { - cache_loop(page, SPR_DCBFR); + cache_loop_page(page, SPR_DCBFR, SPR_UPR_DCP); } EXPORT_SYMBOL(local_dcache_page_flush); void local_icache_page_inv(struct page *page) { - cache_loop(page, SPR_ICBIR); + cache_loop_page(page, SPR_ICBIR, SPR_UPR_ICP); } EXPORT_SYMBOL(local_icache_page_inv); +void local_dcache_range_flush(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_DCBFR, SPR_UPR_DCP); +} + +void local_dcache_range_inv(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_DCBIR, SPR_UPR_DCP); +} + +void local_icache_range_inv(unsigned long start, unsigned long end) +{ + cache_loop(start, end, SPR_ICBIR, SPR_UPR_ICP); +} + void update_cache(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { @@ -58,4 +97,3 @@ void update_cache(struct vm_area_struct *vma, unsigned long address, sync_icache_dcache(folio_page(folio, nr)); } } - diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index d0cb1a0126f9..46b8720db08e 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -35,6 +35,7 @@ #include #include #include +#include int mem_init_done; @@ -176,8 +177,8 @@ void __init paging_init(void) barrier(); /* Invalidate instruction caches after code modification */ - mtspr(SPR_ICBIR, 0x900); - mtspr(SPR_ICBIR, 0xa00); + local_icache_block_inv(0x900); + local_icache_block_inv(0xa00); /* New TLB miss handlers and kernel page tables are in now place. * Make sure that page flags get updated for all pages in TLB by -- cgit v1.2.3 From 4e6d24a309e60251439f08f15de37b489465f17b Mon Sep 17 00:00:00 2001 From: Sahil Siddiq Date: Sat, 19 Apr 2025 21:18:19 +0530 Subject: openrisc: Add cacheinfo support Add cacheinfo support for OpenRISC. Currently, a few CPU cache attributes pertaining to OpenRISC processors are exposed along with other unrelated CPU attributes in the procfs file system (/proc/cpuinfo). However, a few cache attributes remain unexposed. Provide a mechanism that the generic cacheinfo infrastructure can employ to expose these attributes via the sysfs file system. These attributes can then be exposed in /sys/devices/system/cpu/cpuX/cache/indexN. Move the implementation to pull cache attributes from the processor's registers from arch/openrisc/kernel/setup.c with a few modifications. This implementation is based on similar work done for MIPS and LoongArch. 
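As a usage note (not part of the patch): once populate_cache_leaves() has run, the attributes surface through the generic cacheinfo sysfs files, for example

    /sys/devices/system/cpu/cpu0/cache/index0/size
    /sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size
    /sys/devices/system/cpu/cpu0/cache/index0/number_of_sets
    /sys/devices/system/cpu/cpu0/cache/index0/ways_of_associativity

with index0 typically describing the data cache and index1 the instruction cache when both leaves are present.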
Link: https://raw.githubusercontent.com/openrisc/doc/master/openrisc-arch-1.4-rev0.pdf Signed-off-by: Sahil Siddiq Signed-off-by: Stafford Horne --- arch/openrisc/kernel/Makefile | 2 +- arch/openrisc/kernel/cacheinfo.c | 104 +++++++++++++++++++++++++++++++++++++++ arch/openrisc/kernel/setup.c | 44 ++--------------- 3 files changed, 108 insertions(+), 42 deletions(-) create mode 100644 arch/openrisc/kernel/cacheinfo.c diff --git a/arch/openrisc/kernel/Makefile b/arch/openrisc/kernel/Makefile index 79129161f3e0..e4c7d9bdd598 100644 --- a/arch/openrisc/kernel/Makefile +++ b/arch/openrisc/kernel/Makefile @@ -7,7 +7,7 @@ extra-y := vmlinux.lds obj-y := head.o setup.o or32_ksyms.o process.o dma.o \ traps.o time.o irq.o entry.o ptrace.o signal.o \ - sys_call_table.o unwinder.o + sys_call_table.o unwinder.o cacheinfo.o obj-$(CONFIG_SMP) += smp.o sync-timer.o obj-$(CONFIG_STACKTRACE) += stacktrace.o diff --git a/arch/openrisc/kernel/cacheinfo.c b/arch/openrisc/kernel/cacheinfo.c new file mode 100644 index 000000000000..61230545e4ff --- /dev/null +++ b/arch/openrisc/kernel/cacheinfo.c @@ -0,0 +1,104 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * OpenRISC cacheinfo support + * + * Based on work done for MIPS and LoongArch. All original copyrights + * apply as per the original source declaration. + * + * OpenRISC implementation: + * Copyright (C) 2025 Sahil Siddiq + */ + +#include +#include +#include +#include + +static inline void ci_leaf_init(struct cacheinfo *this_leaf, enum cache_type type, + unsigned int level, struct cache_desc *cache, int cpu) +{ + this_leaf->type = type; + this_leaf->level = level; + this_leaf->coherency_line_size = cache->block_size; + this_leaf->number_of_sets = cache->sets; + this_leaf->ways_of_associativity = cache->ways; + this_leaf->size = cache->size; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); +} + +int init_cache_level(unsigned int cpu) +{ + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + int leaves = 0, levels = 0; + unsigned long upr = mfspr(SPR_UPR); + unsigned long iccfgr, dccfgr; + + if (!(upr & SPR_UPR_UP)) { + printk(KERN_INFO + "-- no UPR register... 
unable to detect configuration\n"); + return -ENOENT; + } + + if (cpu_cache_is_present(SPR_UPR_DCP)) { + dccfgr = mfspr(SPR_DCCFGR); + cpuinfo->dcache.ways = 1 << (dccfgr & SPR_DCCFGR_NCW); + cpuinfo->dcache.sets = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); + cpuinfo->dcache.block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); + cpuinfo->dcache.size = + cpuinfo->dcache.sets * cpuinfo->dcache.ways * cpuinfo->dcache.block_size; + leaves += 1; + printk(KERN_INFO + "-- dcache: %d bytes total, %d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->dcache.size, cpuinfo->dcache.block_size, + cpuinfo->dcache.sets, cpuinfo->dcache.ways); + } else + printk(KERN_INFO "-- dcache disabled\n"); + + if (cpu_cache_is_present(SPR_UPR_ICP)) { + iccfgr = mfspr(SPR_ICCFGR); + cpuinfo->icache.ways = 1 << (iccfgr & SPR_ICCFGR_NCW); + cpuinfo->icache.sets = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); + cpuinfo->icache.block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); + cpuinfo->icache.size = + cpuinfo->icache.sets * cpuinfo->icache.ways * cpuinfo->icache.block_size; + leaves += 1; + printk(KERN_INFO + "-- icache: %d bytes total, %d bytes/line, %d set(s), %d way(s)\n", + cpuinfo->icache.size, cpuinfo->icache.block_size, + cpuinfo->icache.sets, cpuinfo->icache.ways); + } else + printk(KERN_INFO "-- icache disabled\n"); + + if (!leaves) + return -ENOENT; + + levels = 1; + + this_cpu_ci->num_leaves = leaves; + this_cpu_ci->num_levels = levels; + + return 0; +} + +int populate_cache_leaves(unsigned int cpu) +{ + struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[smp_processor_id()]; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + int level = 1; + + if (cpu_cache_is_present(SPR_UPR_DCP)) { + ci_leaf_init(this_leaf, CACHE_TYPE_DATA, level, &cpuinfo->dcache, cpu); + this_leaf->attributes = ((mfspr(SPR_DCCFGR) & SPR_DCCFGR_CWS) >> 8) ? 
+ CACHE_WRITE_BACK : CACHE_WRITE_THROUGH; + this_leaf++; + } + + if (cpu_cache_is_present(SPR_UPR_ICP)) + ci_leaf_init(this_leaf, CACHE_TYPE_INST, level, &cpuinfo->icache, cpu); + + this_cpu_ci->cpu_map_populated = true; + + return 0; +} diff --git a/arch/openrisc/kernel/setup.c b/arch/openrisc/kernel/setup.c index 66207cd7bb9e..a9fb9cc6779e 100644 --- a/arch/openrisc/kernel/setup.c +++ b/arch/openrisc/kernel/setup.c @@ -113,21 +113,6 @@ static void print_cpuinfo(void) return; } - if (upr & SPR_UPR_DCP) - printk(KERN_INFO - "-- dcache: %4d bytes total, %2d bytes/line, %d set(s), %d way(s)\n", - cpuinfo->dcache.size, cpuinfo->dcache.block_size, - cpuinfo->dcache.sets, cpuinfo->dcache.ways); - else - printk(KERN_INFO "-- dcache disabled\n"); - if (upr & SPR_UPR_ICP) - printk(KERN_INFO - "-- icache: %4d bytes total, %2d bytes/line, %d set(s), %d way(s)\n", - cpuinfo->icache.size, cpuinfo->icache.block_size, - cpuinfo->icache.sets, cpuinfo->icache.ways); - else - printk(KERN_INFO "-- icache disabled\n"); - if (upr & SPR_UPR_DMP) printk(KERN_INFO "-- dmmu: %4d entries, %lu way(s)\n", 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), @@ -155,7 +140,6 @@ static void print_cpuinfo(void) void __init setup_cpuinfo(void) { struct device_node *cpu; - unsigned long iccfgr, dccfgr; int cpu_id = smp_processor_id(); struct cpuinfo_or1k *cpuinfo = &cpuinfo_or1k[cpu_id]; @@ -163,20 +147,6 @@ void __init setup_cpuinfo(void) if (!cpu) panic("Couldn't find CPU%d in device tree...\n", cpu_id); - iccfgr = mfspr(SPR_ICCFGR); - cpuinfo->icache.ways = 1 << (iccfgr & SPR_ICCFGR_NCW); - cpuinfo->icache.sets = 1 << ((iccfgr & SPR_ICCFGR_NCS) >> 3); - cpuinfo->icache.block_size = 16 << ((iccfgr & SPR_ICCFGR_CBS) >> 7); - cpuinfo->icache.size = - cpuinfo->icache.sets * cpuinfo->icache.ways * cpuinfo->icache.block_size; - - dccfgr = mfspr(SPR_DCCFGR); - cpuinfo->dcache.ways = 1 << (dccfgr & SPR_DCCFGR_NCW); - cpuinfo->dcache.sets = 1 << ((dccfgr & SPR_DCCFGR_NCS) >> 3); - cpuinfo->dcache.block_size = 16 << ((dccfgr & SPR_DCCFGR_CBS) >> 7); - cpuinfo->dcache.size = - cpuinfo->dcache.sets * cpuinfo->dcache.ways * cpuinfo->dcache.block_size; - if (of_property_read_u32(cpu, "clock-frequency", &cpuinfo->clock_frequency)) { printk(KERN_WARNING @@ -293,14 +263,14 @@ static int show_cpuinfo(struct seq_file *m, void *v) unsigned int vr, cpucfgr; unsigned int avr; unsigned int version; +#ifdef CONFIG_SMP struct cpuinfo_or1k *cpuinfo = v; + seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid); +#endif vr = mfspr(SPR_VR); cpucfgr = mfspr(SPR_CPUCFGR); -#ifdef CONFIG_SMP - seq_printf(m, "processor\t\t: %d\n", cpuinfo->coreid); -#endif if (vr & SPR_VR_UVRP) { vr = mfspr(SPR_VR2); version = vr & SPR_VR2_VER; @@ -319,14 +289,6 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "revision\t\t: %d\n", vr & SPR_VR_REV); } seq_printf(m, "frequency\t\t: %ld\n", loops_per_jiffy * HZ); - seq_printf(m, "dcache size\t\t: %d bytes\n", cpuinfo->dcache.size); - seq_printf(m, "dcache block size\t: %d bytes\n", - cpuinfo->dcache.block_size); - seq_printf(m, "dcache ways\t\t: %d\n", cpuinfo->dcache.ways); - seq_printf(m, "icache size\t\t: %d bytes\n", cpuinfo->icache.size); - seq_printf(m, "icache block size\t: %d bytes\n", - cpuinfo->icache.block_size); - seq_printf(m, "icache ways\t\t: %d\n", cpuinfo->icache.ways); seq_printf(m, "immu\t\t\t: %d entries, %lu ways\n", 1 << ((mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTS) >> 2), 1 + (mfspr(SPR_DMMUCFGR) & SPR_DMMUCFGR_NTW)); -- cgit v1.2.3 From 20a43732736ac270c35601f7f22a0bcd2db4cba4 
Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Fri, 28 Feb 2025 21:37:06 +0000 Subject: Documentation: openrisc: Update mailing list The librecores.org mailing list was replaced with vger.kernel.org last year after the old mail server went offline. Update the docs to reflect the new list. Signed-off-by: Stafford Horne --- Documentation/arch/openrisc/openrisc_port.rst | 6 +++--- Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst | 6 +++--- Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/arch/openrisc/openrisc_port.rst b/Documentation/arch/openrisc/openrisc_port.rst index 1565b9546e38..a31ae4960576 100644 --- a/Documentation/arch/openrisc/openrisc_port.rst +++ b/Documentation/arch/openrisc/openrisc_port.rst @@ -7,10 +7,10 @@ target architecture, specifically, is the 32-bit OpenRISC 1000 family (or1k). For information about OpenRISC processors and ongoing development: - ======= ============================= + ======= ============================== website https://openrisc.io - email openrisc@lists.librecores.org - ======= ============================= + email linux-openrisc@vger.kernel.org + ======= ============================== --------------------------------------------------------------------- diff --git a/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst index cadc580fa23b..d2e4ca8a46c7 100644 --- a/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst +++ b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst @@ -17,10 +17,10 @@ OpenRISC 1000系列(或1k)。 关于OpenRISC处理器和正在进行中的开发的信息: - ======= ============================= + ======= ============================== 网站 https://openrisc.io - 邮箱 openrisc@lists.librecores.org - ======= ============================= + 邮箱 linux-openrisc@vger.kernel.org + ======= ============================== --------------------------------------------------------------------- diff --git a/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst b/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst index 422fe9f7a3f2..86590b016d56 100644 --- a/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst +++ b/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst @@ -17,10 +17,10 @@ OpenRISC 1000系列(或1k)。 關於OpenRISC處理器和正在進行中的開發的信息: - ======= ============================= + ======= ============================== 網站 https://openrisc.io - 郵箱 openrisc@lists.librecores.org - ======= ============================= + 郵箱 linux-openrisc@vger.kernel.org + ======= ============================== --------------------------------------------------------------------- -- cgit v1.2.3 From 66ffd2f3161124f2f5019b55d8ef3add26a002a5 Mon Sep 17 00:00:00 2001 From: Stafford Horne Date: Thu, 17 Apr 2025 08:36:02 +0100 Subject: Documentation: openrisc: Update toolchain binaries URL The old development toolchain binaries were hosted in the or1k-gcc development github repo release page. However, now that we have all code upstream I cut releases from stable upstream tarballs. It does not make sense to tag the or1k-gcc github repo releases for these stable releases. Update the toolchain binaries URL to point to where they are now hosted on the or1k-toolchain-build github release page. 
Signed-off-by: Stafford Horne --- Documentation/arch/openrisc/openrisc_port.rst | 6 +++--- Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst | 6 +++--- Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/arch/openrisc/openrisc_port.rst b/Documentation/arch/openrisc/openrisc_port.rst index a31ae4960576..a8f307a3b499 100644 --- a/Documentation/arch/openrisc/openrisc_port.rst +++ b/Documentation/arch/openrisc/openrisc_port.rst @@ -27,11 +27,11 @@ Toolchain binaries can be obtained from openrisc.io or our github releases page. Instructions for building the different toolchains can be found on openrisc.io or Stafford's toolchain build and release scripts. - ========== ================================================= - binaries https://github.com/openrisc/or1k-gcc/releases + ========== ========================================================== + binaries https://github.com/stffrdhrn/or1k-toolchain-build/releases toolchains https://openrisc.io/software building https://github.com/stffrdhrn/or1k-toolchain-build - ========== ================================================= + ========== ========================================================== 2) Building diff --git a/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst index d2e4ca8a46c7..d728e4db0b85 100644 --- a/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst +++ b/Documentation/translations/zh_CN/arch/openrisc/openrisc_port.rst @@ -36,11 +36,11 @@ OpenRISC工具链和Linux的构建指南 工具链的构建指南可以在openrisc.io或Stafford的工具链构建和发布脚本 中找到。 - ====== ================================================= - 二进制 https://github.com/openrisc/or1k-gcc/releases + ====== ========================================================== + 二进制 https://github.com/stffrdhrn/or1k-toolchain-build/releases 工具链 https://openrisc.io/software 构建 https://github.com/stffrdhrn/or1k-toolchain-build - ====== ================================================= + ====== ========================================================== 2) 构建 diff --git a/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst b/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst index 86590b016d56..a1e4517dc601 100644 --- a/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst +++ b/Documentation/translations/zh_TW/arch/openrisc/openrisc_port.rst @@ -36,11 +36,11 @@ OpenRISC工具鏈和Linux的構建指南 工具鏈的構建指南可以在openrisc.io或Stafford的工具鏈構建和發佈腳本 中找到。 - ====== ================================================= - 二進制 https://github.com/openrisc/or1k-gcc/releases + ====== ========================================================== + 二進制 https://github.com/stffrdhrn/or1k-toolchain-build/releases 工具鏈 https://openrisc.io/software 構建 https://github.com/stffrdhrn/or1k-toolchain-build - ====== ================================================= + ====== ========================================================== 2) 構建 -- cgit v1.2.3 From d5d45a7f26194460964eb5677a9226697f7b7fdd Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 10:33:23 -0700 Subject: gcc-15: make 'unterminated string initialization' just a warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc-15 enabling -Wunterminated-string-initialization in -Wextra by default was done with the best intentions, but the warning is still quite broken. 
What annoys me about the warning is that this is a very traditional AND CORRECT way to initialize fixed byte arrays in C: unsigned char hex[16] = "0123456789abcdef"; and we use this all over the kernel. And the warning is fine, but gcc developers apparently never made a reasonable way to disable it. As is (sadly) tradition with these things. Yes, there's "__attribute__((nonstring))", and we have a macro to make that absolutely disgusting syntax more palatable (ie the kernel syntax for that monstrosity is just "__nonstring"). But that attribute is misdesigned. What you'd typically want to do is tell the compiler that you are using a type that isn't a string but a byte array, but that doesn't work at all: warning: ‘nonstring’ attribute does not apply to types [-Wattributes] and because of this fundamental mis-design, you then have to mark each instance of that pattern. This is particularly noticeable in our ACPI code, because ACPI has this notion of a 4-byte "type name" that gets used all over, and is exactly this kind of byte array. This is a sad oversight, because the warning is useful, but really would be so much better if gcc had also given a sane way to indicate that we really just want a byte array type at a type level, not the broken "each and every array definition" level. So now instead of creating a nice "ACPI name" type using something like typedef char acpi_name_t[4] __nonstring; we have to do things like char name[ACPI_NAMESEG_SIZE] __nonstring; in every place that uses this concept and then happens to have the typical initializers. This is annoying me mainly because I think the warning _is_ a good warning, which is why I'm not just turning it off in disgust. But it is hampered by this bad implementation detail. [ And obviously I'm doing this now because system upgrades for me are something that happen in the middle of the release cycle: don't do it before or during travel, or just before or during the busy merge window period. ] Signed-off-by: Linus Torvalds --- Makefile | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Makefile b/Makefile index e65f8735c7bf..0a9992db4fe0 100644 --- a/Makefile +++ b/Makefile @@ -1056,6 +1056,9 @@ KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-option, -Wno-stringop-overflow) KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) +#Currently, disable -Wunterminated-string-initialization as an error +KBUILD_CFLAGS += $(call cc-option, -Wno-error=unterminated-string-initialization) + # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow -- cgit v1.2.3 From 4b4bd8c50f4836ba7d3fcfd6c90f96d2605779fe Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 11:02:18 -0700 Subject: gcc-15: acpi: sprinkle random '__nonstring' crumbles around This is not great: I'd much rather introduce a typedef that is a "ACPI name byte buffer", and use that to mark these special 4-byte ACPI names that do not use NUL termination. But as noted in the previous commit ("gcc-15: make 'unterminated string initialization' just a warning") gcc doesn't actually seem to support that notion, so instead you have to just mark every single array declaration individually. So this is not pretty, but this gets rid of the bulk of the annoying warnings during an allmodconfig build for me. 
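As a minimal illustration of the pattern being annotated (a sketch, not a hunk from this patch):

    char name[ACPI_NAMESEG_SIZE] = "ACPI";              /* gcc-15 -Wextra: no room for a trailing NUL */
    char name[ACPI_NAMESEG_SIZE] __nonstring = "ACPI";  /* declared as a byte array, warning suppressed */

Because the attribute only applies to objects, not types, each such declaration has to be marked individually.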
Signed-off-by: Linus Torvalds --- drivers/acpi/acpica/aclocal.h | 4 ++-- drivers/acpi/acpica/nsrepair2.c | 2 +- drivers/acpi/tables.c | 2 +- include/acpi/actbl.h | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/acpi/acpica/aclocal.h b/drivers/acpi/acpica/aclocal.h index 6f4fe47c955b..6481c48c22bb 100644 --- a/drivers/acpi/acpica/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -293,7 +293,7 @@ acpi_status (*acpi_internal_method) (struct acpi_walk_state * walk_state); * expected_return_btypes - Allowed type(s) for the return value */ struct acpi_name_info { - char name[ACPI_NAMESEG_SIZE]; + char name[ACPI_NAMESEG_SIZE] __nonstring; u16 argument_list; u8 expected_btypes; }; @@ -370,7 +370,7 @@ typedef acpi_status (*acpi_object_converter) (struct acpi_namespace_node * converted_object); struct acpi_simple_repair_info { - char name[ACPI_NAMESEG_SIZE]; + char name[ACPI_NAMESEG_SIZE] __nonstring; u32 unexpected_btypes; u32 package_index; acpi_object_converter object_converter; diff --git a/drivers/acpi/acpica/nsrepair2.c b/drivers/acpi/acpica/nsrepair2.c index 1bb7b71f07f1..330b5e4711da 100644 --- a/drivers/acpi/acpica/nsrepair2.c +++ b/drivers/acpi/acpica/nsrepair2.c @@ -25,7 +25,7 @@ acpi_status (*acpi_repair_function) (struct acpi_evaluate_info * info, return_object_ptr); typedef struct acpi_repair_info { - char name[ACPI_NAMESEG_SIZE]; + char name[ACPI_NAMESEG_SIZE] __nonstring; acpi_repair_function repair_function; } acpi_repair_info; diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index 2295abbecd14..b5205d464a8a 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -396,7 +396,7 @@ static u8 __init acpi_table_checksum(u8 *buffer, u32 length) } /* All but ACPI_SIG_RSDP and ACPI_SIG_FACS: */ -static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst = { +static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst __nonstring = { ACPI_SIG_BERT, ACPI_SIG_BGRT, ACPI_SIG_CPEP, ACPI_SIG_ECDT, ACPI_SIG_EINJ, ACPI_SIG_ERST, ACPI_SIG_HEST, ACPI_SIG_MADT, ACPI_SIG_MSCT, ACPI_SIG_SBST, ACPI_SIG_SLIT, ACPI_SIG_SRAT, diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 451f6276da49..2fc89704be17 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -66,7 +66,7 @@ ******************************************************************************/ struct acpi_table_header { - char signature[ACPI_NAMESEG_SIZE]; /* ASCII table signature */ + char signature[ACPI_NAMESEG_SIZE] __nonstring; /* ASCII table signature */ u32 length; /* Length of table in bytes, including this header */ u8 revision; /* ACPI Specification minor version number */ u8 checksum; /* To make sum of entire table == 0 */ -- cgit v1.2.3 From be913e7c4034bd7a5cbfc3d53188344dc588d45c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 11:04:00 -0700 Subject: gcc-15: get rid of misc extra NUL character padding This removes two cases of explicit NUL padding that now causes warnings because of '-Wunterminated-string-initialization' being part of -Wextra in gcc-15. Gcc is being silly in this case when it says that it truncates a NUL terminator, because in these cases there were _multiple_ NUL characters. But we can get rid of the warning by just simplifying the two initializers that trigger the warning for me, so this does exactly that. 
I'm not sure why the power supply code did that odd .attr_name = #_name "\0", pattern: it was introduced in commit 2cabeaf15129 ("power: supply: core: Cleanup power supply sysfs attribute list"), but that 'attr_name[]' field is an explicitly sized character array in a statically initialized variable, and a string initializer always has a terminating NUL _and_ statically initialized character arrays are zero-padded anyway, so it really seems to be rather extraneous belt-and-suspenders. The zero_uuid[16] initialization in drivers/md/bcache/super.c makes perfect sense, but it isn't necessary for the same reasons, and not worth the new gcc warning noise. Signed-off-by: Linus Torvalds --- drivers/md/bcache/super.c | 2 +- drivers/power/supply/power_supply_sysfs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e42f1400cea9..813b38aec3e4 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -546,7 +546,7 @@ static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid) static struct uuid_entry *uuid_find_empty(struct cache_set *c) { - static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"; + static const char zero_uuid[16] = { 0 }; return uuid_find(c, zero_uuid); } diff --git a/drivers/power/supply/power_supply_sysfs.c b/drivers/power/supply/power_supply_sysfs.c index edb058c19c9c..439dd0bf8644 100644 --- a/drivers/power/supply/power_supply_sysfs.c +++ b/drivers/power/supply/power_supply_sysfs.c @@ -33,7 +33,7 @@ struct power_supply_attr { [POWER_SUPPLY_PROP_ ## _name] = \ { \ .prop_name = #_name, \ - .attr_name = #_name "\0", \ + .attr_name = #_name, \ .text_values = _text, \ .text_values_len = _len, \ } -- cgit v1.2.3 From 05e8d261a34e5c637e37be55c26e42cf5c75ee5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 11:18:55 -0700 Subject: gcc-15: add '__nonstring' markers to byte arrays All of these cases are perfectly valid and good traditional C, but hit by the "you're not NUL-terminating your byte array" warning. And none of the cases want any terminating NUL character. Mark them __nonstring to shut up gcc-15 (and in the case of the ak8974 magnetometer driver, I just removed the explicit array size and let gcc expand the 3-byte and 6-byte arrays by one extra byte, because it was the simpler change). 
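The ak8974 case relies on the usual C rule that an unsized array is sized to include the terminating NUL, e.g. (sketch mirroring the hunk below):

    static const char axis[3] = "XYZ";   /* exactly 3 bytes, no NUL: warns with gcc-15 */
    static const char axis[]  = "XYZ";   /* 4 bytes including the NUL: no warning */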
Signed-off-by: Linus Torvalds --- drivers/iio/magnetometer/ak8974.c | 4 ++-- drivers/input/joystick/magellan.c | 2 +- drivers/net/wireless/ath/carl9170/fw.c | 2 +- fs/cachefiles/key.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/iio/magnetometer/ak8974.c b/drivers/iio/magnetometer/ak8974.c index 08975c60e325..7bc341c69697 100644 --- a/drivers/iio/magnetometer/ak8974.c +++ b/drivers/iio/magnetometer/ak8974.c @@ -535,8 +535,8 @@ static int ak8974_detect(struct ak8974 *ak8974) fab_data2, sizeof(fab_data2)); for (i = 0; i < 3; ++i) { - static const char axis[3] = "XYZ"; - static const char pgaxis[6] = "ZYZXYX"; + static const char axis[] = "XYZ"; + static const char pgaxis[] = "ZYZXYX"; unsigned offz = le16_to_cpu(fab_data2[i]) & 0x7F; unsigned fine = le16_to_cpu(fab_data1[i]); unsigned sens = le16_to_cpu(fab_data1[i + 3]); diff --git a/drivers/input/joystick/magellan.c b/drivers/input/joystick/magellan.c index 2eaa25c9c68c..d73389af4dd5 100644 --- a/drivers/input/joystick/magellan.c +++ b/drivers/input/joystick/magellan.c @@ -48,7 +48,7 @@ struct magellan { static int magellan_crunch_nibbles(unsigned char *data, int count) { - static unsigned char nibbles[16] = "0AB3D56GH9:K #include "internal.h" -static const char cachefiles_charmap[64] = +static const char cachefiles_charmap[64] __nonstring = "0123456789" /* 0 - 9 */ "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ -- cgit v1.2.3 From ac71fabf15679fc7bc56c51bc92bd4b626564c37 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 11:30:11 -0700 Subject: gcc-15: work around sequence-point warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The C sequence points are complicated things, and gcc-15 has apparently added a warning for the case where an object is both used and modified multiple times within the same sequence point. That's a great warning. Or rather, it would be a great warning, except gcc-15 seems to not really be very exact about it, and doesn't notice that the modification are to two entirely different members of the same object: the array counter and the array entries. So that seems kind of silly. That said, the code that gcc complains about is unnecessarily complicated, so moving the array counter update into a separate statement seems like the most straightforward fix for these warnings: drivers/net/wireless/intel/iwlwifi/mld/d3.c: In function ‘iwl_mld_set_netdetect_info’: drivers/net/wireless/intel/iwlwifi/mld/d3.c:1102:66: error: operation on ‘netdetect_info->n_matches’ may be undefined [-Werror=sequence-point] 1102 | netdetect_info->matches[netdetect_info->n_matches++] = match; | ~~~~~~~~~~~~~~~~~~~~~~~~~^~ drivers/net/wireless/intel/iwlwifi/mld/d3.c:1120:58: error: operation on ‘match->n_channels’ may be undefined [-Werror=sequence-point] 1120 | match->channels[match->n_channels++] = | ~~~~~~~~~~~~~~~~~^~ side note: the code at that second warning is actively buggy, and only works on little-endian machines that don't do strict alignment checks. The code casts an array of integers into an array of unsigned long in order to use our bitmap iterators. That happens to work fine on any sane architecture, but it's still wrong. This does *not* fix that more serious problem. This only splits the two assignments into two statements and fixes the compiler warning. I need to get rid of the new warnings in order to be able to actually do any build testing. 
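The change itself just splits the post-increment out of the assignment, i.e. (a sketch of the pattern; the real hunks follow below):

    /* one expression that both reads and modifies the counter */
    info->matches[info->n_matches++] = match;

    /* equivalent two-statement form that gcc-15 does not warn about */
    info->matches[info->n_matches] = match;
    info->n_matches++;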
Signed-off-by: Linus Torvalds --- drivers/net/wireless/intel/iwlwifi/mld/d3.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mld/d3.c b/drivers/net/wireless/intel/iwlwifi/mld/d3.c index 2c6e8ecd93b7..ee99298eebf5 100644 --- a/drivers/net/wireless/intel/iwlwifi/mld/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mld/d3.c @@ -1099,7 +1099,8 @@ iwl_mld_set_netdetect_info(struct iwl_mld *mld, if (!match) return; - netdetect_info->matches[netdetect_info->n_matches++] = match; + netdetect_info->matches[netdetect_info->n_matches] = match; + netdetect_info->n_matches++; /* We inverted the order of the SSIDs in the scan * request, so invert the index here. @@ -1116,9 +1117,11 @@ iwl_mld_set_netdetect_info(struct iwl_mld *mld, for_each_set_bit(j, (unsigned long *)&matches[i].matching_channels[0], - sizeof(matches[i].matching_channels)) - match->channels[match->n_channels++] = + sizeof(matches[i].matching_channels)) { + match->channels[match->n_channels] = netdetect_cfg->channels[j]->center_freq; + match->n_channels++; + } } } -- cgit v1.2.3 From 9c32cda43eb78f78c73aee4aa344b777714e259b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 13:43:47 -0700 Subject: Linux 6.15-rc3 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0a9992db4fe0..3dcad2319662 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc2 +EXTRAVERSION = -rc3 NAME = Baby Opossum Posse # *DOCUMENTATION* -- cgit v1.2.3 From 9d7a0577c9db35c4cc52db90bc415ea248446472 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Apr 2025 15:30:53 -0700 Subject: gcc-15: disable '-Wunterminated-string-initialization' entirely for now I had left the warning around but as a non-fatal error to get my gcc-15 builds going, but fixed up some of the most annoying warning cases so that it wouldn't be *too* verbose. Because I like the _concept_ of the warning, even if I detested the implementation to shut it up. It turns out the implementation to shut it up is even more broken than I thought, and my "shut up most of the warnings" patch just caused fatal errors on gcc-14 instead. I had tested with clang, but when I upgrade my development environment, I try to do it on all machines because I hate having different systems to maintain, and hadn't realized that gcc-14 now had issues. The ACPI case is literally why I wanted to have a *type* that doesn't trigger the warning (see commit d5d45a7f2619: "gcc-15: make 'unterminated string initialization' just a warning"), instead of marking individual places as "__nonstring". But gcc-14 doesn't like that __nonstring location that shut gcc-15 up, because it's on an array of char arrays, not on one single array: drivers/acpi/tables.c:399:1: error: 'nonstring' attribute ignored on objects of type 'const char[][4]' [-Werror=attributes] 399 | static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst __nonstring = { | ^~~~~~ and my attempts to nest it properly with a type had failed, because of how gcc doesn't like marking the types as having attributes, only symbols. There may be some trick to it, but I was already annoyed by the bad attribute design, now I'm just entirely fed up with it. I wish gcc had a proper way to say "this type is a *byte* array, not a string". 
The obvious thing would be to distinguish between "char []" and an explicitly signed "unsigned char []" (as opposed to an implicitly unsigned char, which is typically an architecture-specific default, but for the kernel is universal thanks to '-funsigned-char'). But any "we can typedef a 8-bit type to not become a string just because it's an array" model would be fine. But "__attribute__((nonstring))" is sadly not that sane model. Reported-by: Chris Clayton Fixes: 4b4bd8c50f48 ("gcc-15: acpi: sprinkle random '__nonstring' crumbles around") Fixes: d5d45a7f2619 ("gcc-15: make 'unterminated string initialization' just a warning") Signed-off-by: Linus Torvalds --- Makefile | 4 ++-- drivers/acpi/tables.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile b/Makefile index 3dcad2319662..e94bbb2298c8 100644 --- a/Makefile +++ b/Makefile @@ -1056,8 +1056,8 @@ KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-option, -Wno-stringop-overflow) KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) -#Currently, disable -Wunterminated-string-initialization as an error -KBUILD_CFLAGS += $(call cc-option, -Wno-error=unterminated-string-initialization) +#Currently, disable -Wunterminated-string-initialization as broken +KBUILD_CFLAGS += $(call cc-option, -Wno-unterminated-string-initialization) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/drivers/acpi/tables.c b/drivers/acpi/tables.c index b5205d464a8a..2295abbecd14 100644 --- a/drivers/acpi/tables.c +++ b/drivers/acpi/tables.c @@ -396,7 +396,7 @@ static u8 __init acpi_table_checksum(u8 *buffer, u32 length) } /* All but ACPI_SIG_RSDP and ACPI_SIG_FACS: */ -static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst __nonstring = { +static const char table_sigs[][ACPI_NAMESEG_SIZE] __initconst = { ACPI_SIG_BERT, ACPI_SIG_BGRT, ACPI_SIG_CPEP, ACPI_SIG_ECDT, ACPI_SIG_EINJ, ACPI_SIG_ERST, ACPI_SIG_HEST, ACPI_SIG_MADT, ACPI_SIG_MSCT, ACPI_SIG_SBST, ACPI_SIG_SLIT, ACPI_SIG_SRAT, -- cgit v1.2.3 From 4c0d2c67ac6d54ba71bb3438147b144c25fdee2c Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Apr 2025 20:30:18 -0400 Subject: bcachefs: Fix early startup error path Don't set JOURNAL_running until we're also calling journal_space_available() for the first time. If JOURNAL_running is set, shutdown will write an empty journal entry - but this will hit an assert in journal_entry_open() if we've never called journal_space_available(). 
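For clarity, the essential ordering enforced by the new bch2_journal_set_replay_done() helper below is:

    bch2_journal_space_available(j);            /* compute journal space first */
    set_bit(JOURNAL_running, &j->flags);        /* only then mark the journal running */
    set_bit(JOURNAL_replay_done, &j->flags);    /* and finally mark replay done */

so a shutdown can no longer try to open a journal entry before space has ever been computed.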
Reported-by: syzbot+53bb24d476ef8368a7f0@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 17 +++++++++++++++-- fs/bcachefs/journal.h | 7 +------ fs/bcachefs/recovery.c | 6 +++--- fs/bcachefs/super.c | 19 +++++++++---------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index d8f74b6d0a75..84cb74ba91e6 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1462,8 +1462,6 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); - - set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; j->reservations.idx = journal_cur_seq(j); @@ -1474,6 +1472,21 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) return 0; } +void bch2_journal_set_replay_done(struct journal *j) +{ + /* + * journal_space_available must happen before setting JOURNAL_running + * JOURNAL_running must happen before JOURNAL_replay_done + */ + spin_lock(&j->lock); + bch2_journal_space_available(j); + + set_bit(JOURNAL_need_flush_write, &j->flags); + set_bit(JOURNAL_running, &j->flags); + set_bit(JOURNAL_replay_done, &j->flags); + spin_unlock(&j->lock); +} + /* init/exit: */ void bch2_dev_journal_exit(struct bch_dev *ca) diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 47828771f9c2..641e20c05a14 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -437,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline void bch2_journal_set_replay_done(struct journal *j) -{ - BUG_ON(!test_bit(JOURNAL_running, &j->flags)); - set_bit(JOURNAL_replay_done, &j->flags); -} - void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); @@ -459,6 +453,7 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); int bch2_fs_journal_start(struct journal *, u64); +void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 606d684e6f23..bea578e33988 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -1129,13 +1129,13 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); - ret = bch2_fs_read_write_early(c); if (ret) goto err; + set_bit(BCH_FS_accounting_replay_done, &c->flags); + bch2_journal_set_replay_done(&c->journal); + for_each_member_device(c, ca) { ret = bch2_dev_usage_init(ca, false); if (ret) { diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index e8a17ed1615d..a8bc02540cce 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -466,29 +466,28 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_clean_shutdown, &c->flags); + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { + bch2_dev_allocator_add(c, ca); + percpu_ref_reinit(&ca->io_ref[WRITE]); + } + bch2_recalc_capacity(c); + /* * First journal write must be a flush write: after a clean shutdown we * don't read the journal, so the first journal write may end up * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: 
*/ + spin_lock(&c->journal.lock); set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); - - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), READ) { - bch2_dev_allocator_add(c, ca); - percpu_ref_reinit(&ca->io_ref[WRITE]); - } - bch2_recalc_capacity(c); + bch2_journal_space_available(&c->journal); + spin_unlock(&c->journal.lock); ret = bch2_fs_mark_dirty(c); if (ret) goto err; - spin_lock(&c->journal.lock); - bch2_journal_space_available(&c->journal); - spin_unlock(&c->journal.lock); - ret = bch2_journal_reclaim_start(&c->journal); if (ret) goto err; -- cgit v1.2.3 From aa6a591f0fd740e27c54110f8425b53133ad4165 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Apr 2025 22:38:14 -0400 Subject: bcachefs: Fix null ptr deref in bch2_snapshot_tree_oldest_subvol() Reported-by: syzbot+baee8591f336cab0958b@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/snapshot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index b7de29aed839..fec569c7deb1 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -396,7 +396,7 @@ u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) u32 subvol = 0, s; rcu_read_lock(); - while (id) { + while (id && bch2_snapshot_exists(c, id)) { s = snapshot_t(c, id)->subvol; if (s && (!subvol || s < subvol)) -- cgit v1.2.3 From 417f01e726036b564e2e14c39b2be58e93bf7971 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 17 Apr 2025 22:44:16 -0400 Subject: bcachefs: Error ratelimiting is no longer only during fsck We now more often do repair automatically, without the user invoking fsck - and sometimes that can involve fixing lots of errors, so let's avoid flooding the dmesg log. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/error.c | 17 ++++++++++++----- fs/bcachefs/error.h | 1 + fs/bcachefs/super.c | 1 + 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index baf5dfb32298..925b0b54ea2f 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -272,9 +272,6 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, { struct fsck_err_state *s; - if (!test_bit(BCH_FS_fsck_running, &c->flags)) - return NULL; - list_for_each_entry(s, &c->fsck_error_msgs, list) if (s->id == id) { /* @@ -639,14 +636,14 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, return ret; } -void bch2_flush_fsck_errs(struct bch_fs *c) +static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) { struct fsck_err_state *s, *n; mutex_lock(&c->fsck_error_msgs_lock); list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { - if (s->ratelimited && s->last_msg) + if (print && s->ratelimited && s->last_msg) bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); list_del(&s->list); @@ -657,6 +654,16 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } +void bch2_flush_fsck_errs(struct bch_fs *c) +{ + __bch2_flush_fsck_errs(c, true); +} + +void bch2_free_fsck_errs(struct bch_fs *c) +{ + __bch2_flush_fsck_errs(c, false); +} + int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum, u64 offset) { diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index d0d024dc714b..4a364fd44abe 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -93,6 +93,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) void bch2_flush_fsck_errs(struct bch_fs *); +void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index a8bc02540cce..adbf3bbed6bc 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -552,6 +552,7 @@ static void __bch2_fs_free(struct bch_fs *c) bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); + bch2_free_fsck_errs(c); bch2_fs_accounting_exit(c); bch2_fs_sb_errors_exit(c); bch2_fs_counters_exit(c); -- cgit v1.2.3 From 71f8e806a5e4edada72456ee3b2e2d7eab6fadee Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 12:49:00 -0400 Subject: bcachefs: Stricter checks on "key allowed in this btree" Syzbot managed to come up with a filesystem where check/repair got rather confused at finding a reflink pointer in the inodes btree. Currently, the "key allowed in this btree" checks only apply at commit time, not read time - for forwards compatibility. It seems this is too loose. Now, strict key type allowed checks apply: - at commit time (no forward compatibility issues) - for btree node pointers - if it's a known btree, known key type, and the key type has the "BKEY_TYPE_strict_btree_checks" flag. This means we still have the option of using generic key types - e.g. KEY_TYPE_error, KEY_TYPE_set - on more existing btrees in the future, while most key types that are intended for only a specific btree get stricter checks. 
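As a generic sketch of the x-macro technique the patch extends (hypothetical names, not code from bcachefs): adding a flags column to the list lets existing expansions keep working through a variadic x(), while new users can pick the flag up:

    enum { TYPE_strict = 1 << 0 };

    #define TYPES()                \
            x(foo, 0, 0)           \
            x(bar, 1, TYPE_strict)

    enum type {
    #define x(name, nr, ...)       T_##name = nr,
            TYPES()
    #undef x
    };

    static const unsigned type_flags[] = {
    #define x(name, nr, flags)     [T_##name] = flags,
            TYPES()
    #undef x
    };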
Reported-by: syzbot+baee8591f336cab0958b@syzkaller.appspotmail.com Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 80 +++++++++++++++++++++++-------------------- fs/bcachefs/bkey_methods.c | 24 ++++++++++--- 2 files changed, 62 insertions(+), 42 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index a3db328dee31..377f04d92a14 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k) #define __BKEY_PADDED(key, pad) \ struct bkey_i key; __u64 key ## _pad[pad] +enum bch_bkey_type_flags { + BKEY_TYPE_strict_btree_checks = BIT(0), +}; + /* * - DELETED keys are used internally to mark keys that should be ignored but * override keys in composition order. Their version number is ignored. @@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k) * * - WHITEOUT: for hash table btrees */ -#define BCH_BKEY_TYPES() \ - x(deleted, 0) \ - x(whiteout, 1) \ - x(error, 2) \ - x(cookie, 3) \ - x(hash_whiteout, 4) \ - x(btree_ptr, 5) \ - x(extent, 6) \ - x(reservation, 7) \ - x(inode, 8) \ - x(inode_generation, 9) \ - x(dirent, 10) \ - x(xattr, 11) \ - x(alloc, 12) \ - x(quota, 13) \ - x(stripe, 14) \ - x(reflink_p, 15) \ - x(reflink_v, 16) \ - x(inline_data, 17) \ - x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) \ - x(alloc_v2, 20) \ - x(subvolume, 21) \ - x(snapshot, 22) \ - x(inode_v2, 23) \ - x(alloc_v3, 24) \ - x(set, 25) \ - x(lru, 26) \ - x(alloc_v4, 27) \ - x(backpointer, 28) \ - x(inode_v3, 29) \ - x(bucket_gens, 30) \ - x(snapshot_tree, 31) \ - x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) \ - x(accounting, 34) \ - x(inode_alloc_cursor, 35) +#define BCH_BKEY_TYPES() \ + x(deleted, 0, 0) \ + x(whiteout, 1, 0) \ + x(error, 2, 0) \ + x(cookie, 3, 0) \ + x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ + x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ + x(extent, 6, BKEY_TYPE_strict_btree_checks) \ + x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ + x(inode, 8, BKEY_TYPE_strict_btree_checks) \ + x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ + x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ + x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ + x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ + x(quota, 13, BKEY_TYPE_strict_btree_checks) \ + x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ + x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ + x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ + x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ + x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ + x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ + x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ + x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ + x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ + x(set, 25, 0) \ + x(lru, 26, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ + x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ + x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ + x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ + x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ + x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ + x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ + x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ + x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) enum bch_bkey_type { -#define x(name, nr) KEY_TYPE_##name = nr, 
+#define x(name, nr, ...) KEY_TYPE_##name = nr, BCH_BKEY_TYPES() #undef x KEY_TYPE_MAX, diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 15c93576b5c2..00d05ccfaf73 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -21,7 +21,7 @@ #include "xattr.h" const char * const bch2_bkey_types[] = { -#define x(name, nr) #name, +#define x(name, nr, ...) #name, BCH_BKEY_TYPES() #undef x NULL @@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ }) const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, +#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x }; @@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = { #undef x }; +static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { +#define x(name, nr, flags) [KEY_TYPE_##name] = flags, + BCH_BKEY_TYPES() +#undef x +}; + const char *bch2_btree_node_type_str(enum btree_node_type type) { return type == BKEY_TYPE_btree ? "internal btree node" : bch2_btree_id_str(type - 1); @@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && + enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX + ? bch2_bkey_type_flags[k.k->type] + : 0; + + bool strict_key_type_allowed = + (from.flags & BCH_VALIDATE_commit) || + type == BKEY_TYPE_btree || + (from.btree < BTREE_ID_NR && + (bkey_flags & BKEY_TYPE_strict_btree_checks)); + + bkey_fsck_err_on(strict_key_type_allowed && + k.k->type < KEY_TYPE_MAX && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", -- cgit v1.2.3 From 6468aef231890806ccc4e921b111ff9275880832 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 16:42:50 -0400 Subject: bcachefs: Ensure journal space is block size aligned We don't require that bucket size is block size aligned (although it should be!) - so we need to handle this in the journal code. This fixes an assertion pop in jorunal_entry_close(), where the journal entry overruns available space - after rounding it up to block size. Fixes: https://github.com/koverstreet/bcachefs/issues/854 Signed-off-by: Kent Overstreet --- fs/bcachefs/journal_reclaim.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 5d1547aa118a..ea670c3c43d8 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -252,7 +252,10 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: - j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_sectors = !ret + ? 
round_down(j->space[journal_space_discarded].next_entry, + block_sectors(c)) + : 0; j->cur_entry_error = ret; if (!ret) -- cgit v1.2.3 From 4c327d03d7c9d5e815a2aada112c442e4a2f8665 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 18 Apr 2025 13:38:23 -0400 Subject: bcachefs: Change __journal_entry_close() assert to ERO We've got some reports of this happening in the wild, and need a bit more info to debug it: https://github.com/koverstreet/bcachefs/issues/854 https://www.reddit.com/r/bcachefs/comments/1k28kjm/surprise_soft_lockup/ Signed-off-by: Kent Overstreet --- fs/bcachefs/journal.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 84cb74ba91e6..bb45d3634194 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -281,7 +281,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; - BUG_ON(sectors > buf->sectors); + if (unlikely(sectors > buf->sectors)) { + struct printbuf err = PRINTBUF; + err.atomic++; + + prt_printf(&err, "journal entry overran reserved space: %u > %u\n", + sectors, buf->sectors); + prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", + le32_to_cpu(buf->data->u64s), buf->u64s_reserved, + j->cur_entry_u64s, + c->block_bits); + prt_printf(&err, "fatal error - emergency read only"); + bch2_journal_halt_locked(j); + + bch_err(c, "%s", err.buf); + printbuf_exit(&err); + return; + } + buf->sectors = sectors; /* -- cgit v1.2.3 From bfbb76ec9808ce80e661dae77018b77488bb56d0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 19 Apr 2025 02:50:29 -0400 Subject: bcachefs: Fix ref leak in write_super() found with the new enumerated_ref code Signed-off-by: Kent Overstreet --- fs/bcachefs/super-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 25b6bce05c3c..cb5d960aed92 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1102,7 +1102,8 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, ")"); bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_sb_not_downgraded; + ret = -BCH_ERR_sb_not_downgraded; + goto out; } darray_for_each(online_devices, ca) { -- cgit v1.2.3 From 10e42b6f25995c6ce7c462e1bcb9684acc0f09a0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 15 Apr 2025 10:26:05 -0400 Subject: bcachefs: bch2_copygc_wakeup() Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.h | 9 +++++++++ fs/bcachefs/rebalance.c | 4 ++-- fs/bcachefs/rebalance.h | 2 +- fs/bcachefs/super.c | 4 ++-- fs/bcachefs/sysfs.c | 7 +++---- 5 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index ea181fef5bc9..d1885cf67a45 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -5,6 +5,15 @@ unsigned long bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); +static inline void bch2_copygc_wakeup(struct bch_fs *c) +{ + rcu_read_lock(); + struct task_struct *p = rcu_dereference(c->copygc_thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index c63fa53f30d2..39006e6affe3 100644 --- 
a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -262,7 +262,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_set_rebalance_needs_scan_trans(trans, inum)); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return ret; } @@ -664,7 +664,7 @@ void bch2_rebalance_stop(struct bch_fs *c) c->rebalance.thread = NULL; if (p) { - /* for sychronizing with rebalance_wakeup() */ + /* for sychronizing with bch2_rebalance_wakeup() */ synchronize_rcu(); kthread_stop(p); diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 62a3859d3823..e5e8eb4a2dd1 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); -static inline void rebalance_wakeup(struct bch_fs *c) +static inline void bch2_rebalance_wakeup(struct bch_fs *c) { struct task_struct *p; diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index adbf3bbed6bc..ee27734f350f 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1500,7 +1500,7 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) printbuf_exit(&name); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return 0; } @@ -1646,7 +1646,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return ret; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index e5f003c29369..82ee333ddd21 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -654,11 +654,10 @@ static ssize_t sysfs_opt_store(struct bch_fs *c, bch2_set_rebalance_needs_scan(c, 0); if (v && id == Opt_rebalance_enabled) - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); - if (v && id == Opt_copygc_enabled && - c->copygc_thread) - wake_up_process(c->copygc_thread); + if (v && id == Opt_copygc_enabled) + bch2_copygc_wakeup(c); if (id == Opt_discard && !ca) { mutex_lock(&c->sb_lock); -- cgit v1.2.3 From 078d3ee7c162cd66d76171579c02d7890bd77daf Mon Sep 17 00:00:00 2001 From: Smita Koralahalli Date: Mon, 7 Apr 2025 19:27:34 +0000 Subject: cxl/core/regs.c: Skip Memory Space Enable check for RCD and RCH Ports According to CXL r3.2 section 8.2.1.2, the PCI_COMMAND register fields, including Memory Space Enable bit, have no effect on the behavior of an RCD Upstream Port. Retaining this check may incorrectly cause cxl_pci_probe() to fail on a valid RCD upstream Port. While the specification is explicit only for RCD Upstream Ports, this check is solely for accessing the RCRB, which is always mapped through memory space. Therefore, it's safe to remove the check entirely. In practice, firmware reliably enables the Memory Space Enable bit for RCH Downstream Ports and no failures have been observed. Removing the check simplifies the code and avoids unnecessary special-casing, while relying on BIOS/firmware to configure devices correctly. Moreover, any failures due to inaccessible RCRB regions will still be caught either in __rcrb_to_component() or while parsing the component register block.
The following failure was observed in dmesg when the check was present: cxl_pci 0000:7f:00.0: No component registers (-6) Fixes: d5b1a27143cb ("cxl/acpi: Extract component registers of restricted hosts from RCRB") Signed-off-by: Smita Koralahalli Cc: Reviewed-by: Ira Weiny Reviewed-by: Terry Bowman Reviewed-by: Dave Jiang Reviewed-by: Robert Richter Link: https://patch.msgid.link/20250407192734.70631-1-Smita.KoralahalliChannabasappa@amd.com Signed-off-by: Dave Jiang --- drivers/cxl/core/regs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/cxl/core/regs.c b/drivers/cxl/core/regs.c index 117c2e94c761..5ca7b0eed568 100644 --- a/drivers/cxl/core/regs.c +++ b/drivers/cxl/core/regs.c @@ -581,7 +581,6 @@ resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri resource_size_t rcrb = ri->base; void __iomem *addr; u32 bar0, bar1; - u16 cmd; u32 id; if (which == CXL_RCRB_UPSTREAM) @@ -603,7 +602,6 @@ resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri } id = readl(addr + PCI_VENDOR_ID); - cmd = readw(addr + PCI_COMMAND); bar0 = readl(addr + PCI_BASE_ADDRESS_0); bar1 = readl(addr + PCI_BASE_ADDRESS_1); iounmap(addr); @@ -618,8 +616,6 @@ resource_size_t __rcrb_to_component(struct device *dev, struct cxl_rcrb_info *ri dev_err(dev, "Failed to access Downstream Port RCRB\n"); return CXL_RESOURCE_NONE; } - if (!(cmd & PCI_COMMAND_MEMORY)) - return CXL_RESOURCE_NONE; /* The RCRB is a Memory Window, and the MEM_TYPE_1M bit is obsolete */ if (bar0 & (PCI_BASE_ADDRESS_MEM_TYPE_1M | PCI_BASE_ADDRESS_SPACE_IO)) return CXL_RESOURCE_NONE; -- cgit v1.2.3 From d64e8e842bb18403d03a85b29caa5cb5f3bd4c7d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 11:52:35 -0400 Subject: bcachefs: Refactor bch2_run_recovery_passes() Don't use a continue; this simplifies the next patch where run_recovery_passes() will be responsible for waking up copygc and rebalance at the appropriate time. 
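As an illustration of the restructuring described above, a minimal standalone sketch (invented names, not the bcachefs code): inverting the should-run test removes the need for continue, so the bookkeeping that advances to the next pass is written exactly once, which is where a later patch can hook extra work.

/* Illustrative sketch only, not bcachefs code. */
#include <stdbool.h>
#include <stdio.h>

#define NR_PASSES 8

/* Stand-in for a real policy check. */
static bool should_run(unsigned pass)
{
    return pass % 2 == 0;
}

int main(void)
{
    unsigned curr = 0;

    while (curr < NR_PASSES) {
        unsigned next = curr + 1;

        if (should_run(curr))   /* inverted test, no 'continue' needed */
            printf("running pass %u\n", curr);

        /* Per-iteration bookkeeping lives in one place. */
        curr = next;
    }
    return 0;
}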
Signed-off-by: Kent Overstreet --- fs/bcachefs/recovery_passes.c | 63 +++++++++++++++++++++---------------------- 1 file changed, 30 insertions(+), 33 deletions(-) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 593ff142530d..8f769804cb59 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -12,6 +12,7 @@ #include "journal.h" #include "lru.h" #include "logged_ops.h" +#include "movinggc.h" #include "rebalance.h" #include "recovery.h" #include "recovery_passes.h" @@ -262,49 +263,45 @@ int bch2_run_recovery_passes(struct bch_fs *c) */ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - c->next_recovery_pass = c->curr_recovery_pass + 1; + spin_lock_irq(&c->recovery_pass_lock); - spin_lock_irq(&c->recovery_pass_lock); + while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { unsigned pass = c->curr_recovery_pass; + c->next_recovery_pass = pass + 1; + if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) { - spin_unlock_irq(&c->recovery_pass_lock); + c->curr_recovery_pass > c->opts.recovery_pass_last) break; - } - if (!should_run_recovery_pass(c, pass)) { - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, pass); + if (should_run_recovery_pass(c, pass)) { spin_unlock_irq(&c->recovery_pass_lock); - continue; - } - spin_unlock_irq(&c->recovery_pass_lock); - - ret = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); - - if (!ret && !test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); - - spin_lock_irq(&c->recovery_pass_lock); - if (c->next_recovery_pass < c->curr_recovery_pass) { - /* - * bch2_run_explicit_recovery_pass() was called: we - * can't always catch -BCH_ERR_restart_recovery because - * it may have been called from another thread (btree - * node read completion) - */ - ret = 0; - c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); - } else { - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); + ret = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + if (!ret && !test_bit(BCH_FS_error, &c->flags)) + bch2_clear_recovery_pass_required(c, pass); + spin_lock_irq(&c->recovery_pass_lock); + + if (c->next_recovery_pass < c->curr_recovery_pass) { + /* + * bch2_run_explicit_recovery_pass() was called: we + * can't always catch -BCH_ERR_restart_recovery because + * it may have been called from another thread (btree + * node read completion) + */ + ret = 0; + c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); + } else { + c->recovery_passes_complete |= BIT_ULL(pass); + c->recovery_pass_done = max(c->recovery_pass_done, pass); + } } + c->curr_recovery_pass = c->next_recovery_pass; - spin_unlock_irq(&c->recovery_pass_lock); } + spin_unlock_irq(&c->recovery_pass_lock); + return ret; } -- cgit v1.2.3 From 387df331298eaa9d6dbb4e30f376d632dd798b46 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 13 Apr 2025 06:44:23 -0400 Subject: bcachefs: Start copygc, rebalance threads earlier Previously, copygc and rebalance weren't started until the very end of mounting, after all recovery passes have finished. But copygc really should be started earlier, since it may be needed for allocations to make forward progress.
Additionally, we've been seeing occasional bug reports where starting the kthread fails due to a pending signal - i.e. we're getting timed out by systemd (during a version upgrade), but we're not seeing the signal until mount is about to complete. Additionally, we now have copygc/rebalance explicitly wait for check_snapshots to complete (if being run); they require that for snapshot_is_ancestor() in the data move path. Signed-off-by: Kent Overstreet --- fs/bcachefs/movinggc.c | 7 ++++++ fs/bcachefs/rebalance.c | 7 ++++++ fs/bcachefs/recovery.c | 4 ++++ fs/bcachefs/recovery_passes.c | 7 ++++++ fs/bcachefs/super.c | 50 ++++++++++++------------------------------- 5 files changed, 39 insertions(+), 36 deletions(-) diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 159410c50861..96873372b516 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -356,6 +356,13 @@ static int bch2_copygc_thread(void *arg) set_freezable(); + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. + */ + kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_should_stop()); + bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, writepoint_ptr(&c->copygc_write_point), diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index 39006e6affe3..4ccdfc1f34aa 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -581,6 +581,13 @@ static int bch2_rebalance_thread(void *arg) set_freezable(); + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. + */ + kthread_wait_freezable(c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_should_stop()); + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, writepoint_ptr(&c->rebalance_write_point), true); diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index bea578e33988..d6c4ef819d40 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -18,6 +18,7 @@ #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "movinggc.h" #include "namei.h" #include "quota.h" #include "rebalance.h" @@ -1194,6 +1195,9 @@ int bch2_fs_initialize(struct bch_fs *c) c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); + if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); if (ret) diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 8f769804cb59..22f72bb5b853 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -266,6 +266,7 @@ int bch2_run_recovery_passes(struct bch_fs *c) spin_lock_irq(&c->recovery_pass_lock); while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { + unsigned prev_done = c->recovery_pass_done; unsigned pass = c->curr_recovery_pass; c->next_recovery_pass = pass + 1; @@ -299,6 +300,12 @@ int bch2_run_recovery_passes(struct bch_fs *c) } c->curr_recovery_pass = c->next_recovery_pass; + + if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && + c->recovery_pass_done > BCH_RECOVERY_PASS_check_snapshots) { + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); + } } spin_unlock_irq(&c->recovery_pass_lock); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index ee27734f350f..4060af4692f9 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -418,32 +418,6 @@ bool bch2_fs_emergency_read_only_locked(struct 
bch_fs *c) return ret; } -static int bch2_fs_read_write_late(struct bch_fs *c) -{ - int ret; - - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. - * - * Ideally we'd start copygc/rebalance earlier instead of waiting for - * all of recovery/fsck to complete: - */ - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } - - ret = bch2_rebalance_start(c); - if (ret) { - bch_err(c, "error starting rebalance thread"); - return ret; - } - - return 0; -} - static int __bch2_fs_read_write(struct bch_fs *c, bool early) { int ret; @@ -503,10 +477,17 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) atomic_long_inc(&c->writes[i]); } #endif - if (!early) { - ret = bch2_fs_read_write_late(c); - if (ret) - goto err; + + ret = bch2_copygc_start(c); + if (ret) { + bch_err_msg(c, ret, "error starting copygc thread"); + goto err; + } + + ret = bch2_rebalance_start(c); + if (ret) { + bch_err_msg(c, ret, "error starting rebalance thread"); + goto err; } bch2_do_discards(c); @@ -1082,13 +1063,10 @@ int bch2_fs_start(struct bch_fs *c) wake_up(&c->ro_ref_wait); down_write(&c->state_lock); - if (c->opts.read_only) { + if (c->opts.read_only) bch2_fs_read_only(c); - } else { - ret = !test_bit(BCH_FS_rw, &c->flags) - ? bch2_fs_read_write(c) - : bch2_fs_read_write_late(c); - } + else if (!test_bit(BCH_FS_rw, &c->flags)) + ret = bch2_fs_read_write(c); up_write(&c->state_lock); err: -- cgit v1.2.3 From 4ede80a9a860968f9e63c6b9262683d3139194e0 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 18:37:12 -0400 Subject: bcachefs: Allocator now copes with unaligned buckets We had a buggy release of bcachefs-tools that wasn't properly aligning bucket sizes. We can't ask users to reformat - and it's easy to teach the allocator to make sure writes are properly aligned. Signed-off-by: Kent Overstreet --- fs/bcachefs/alloc_foreground.c | 2 ++ fs/bcachefs/alloc_foreground.h | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 7c930ef77380..effafc3e0ced 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -1425,6 +1425,8 @@ alloc_done: open_bucket_for_each(c, &wp->ptrs, ob, i) wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + wp->sectors_free = rounddown(wp->sectors_free, block_sectors(c)); + BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); return 0; diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 69ec6a012898..4c1e33cf57c0 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -110,7 +110,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); + ob_push(c, ob->sectors_free < block_sectors(c) + ? &ptrs + : &keep, ob); wp->ptrs = keep; mutex_unlock(&wp->lock); -- cgit v1.2.3 From 7a4a86618eb7bceac17878e484a8f98da1395d68 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 28 Mar 2025 12:26:24 -0400 Subject: bcachefs: Implement fileattr_(get|set) inode_operations.fileattr_(get|set) didn't exist when the various flag ioctls were implemented - but they do now, which means we can delete a bunch of ioctl code in favor of standard VFS level wrappers.
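For illustration, a hedged sketch of what the VFS-level wrappers expect from a filesystem: a hypothetical "foo" filesystem (foo_inode_info, FOO_I and FOO_USER_VISIBLE_FLAGS are invented here, not taken from the patch) wires up fileattr_get/fileattr_set, and the VFS then translates FS_IOC_GETFLAGS/SETFLAGS and FS_IOC_FSGETXATTR/FSSETXATTR into these two calls.

/* Sketch only - a hypothetical filesystem, not the bcachefs code below. */
#include <linux/fs.h>
#include <linux/fileattr.h>

#define FOO_USER_VISIBLE_FLAGS  (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NOATIME_FL)

struct foo_inode_info {
    struct inode    vfs_inode;
    unsigned int    flags;          /* FS_*_FL bits */
};

static inline struct foo_inode_info *FOO_I(struct inode *inode)
{
    return container_of(inode, struct foo_inode_info, vfs_inode);
}

static int foo_fileattr_get(struct dentry *dentry, struct fileattr *fa)
{
    struct foo_inode_info *fi = FOO_I(d_inode(dentry));

    fileattr_fill_flags(fa, fi->flags & FOO_USER_VISIBLE_FLAGS);
    return 0;
}

static int foo_fileattr_set(struct mnt_idmap *idmap,
                            struct dentry *dentry, struct fileattr *fa)
{
    struct foo_inode_info *fi = FOO_I(d_inode(dentry));

    if (fileattr_has_fsx(fa))
        return -EOPNOTSUPP;     /* this sketch only handles the classic flags */

    fi->flags = fa->flags & FOO_USER_VISIBLE_FLAGS;
    return 0;
}

static const struct inode_operations foo_file_inode_operations = {
    .fileattr_get   = foo_fileattr_get,
    .fileattr_set   = foo_fileattr_set,
};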
Closes: https://lore.kernel.org/linux-bcachefs/7ltgrgqgfummyrlvw7hnfhnu42rfiamoq3lpcvrjnlyytldmzp@yazbhusnztqn/ Cc: Petr Vorel Cc: Andrea Cervesato Cc: Dave Chinner Signed-off-by: Kent Overstreet --- fs/bcachefs/fs-ioctl.c | 217 ------------------------------------------------- fs/bcachefs/fs-ioctl.h | 75 ----------------- fs/bcachefs/fs.c | 173 +++++++++++++++++++++++++++++++++++++++ fs/bcachefs/util.h | 38 +++++++++ 4 files changed, 211 insertions(+), 292 deletions(-) diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 14886e1d4d6d..a82dfce9e4ad 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -21,206 +21,6 @@ #define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ -struct flags_set { - unsigned mask; - unsigned flags; - - unsigned projid; - - bool set_projinherit; - bool projinherit; -}; - -static int bch2_inode_flags_set(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not inode->i_flags: - */ - struct flags_set *s = p; - unsigned newflags = s->flags; - unsigned oldflags = bi->bi_flags & s->mask; - - if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - if (!S_ISREG(bi->bi_mode) && - !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) - return -EINVAL; - - if ((newflags ^ oldflags) & BCH_INODE_casefolded) { -#ifdef CONFIG_UNICODE - int ret = 0; - /* Not supported on individual files. */ - if (!S_ISDIR(bi->bi_mode)) - return -EOPNOTSUPP; - - /* - * Make sure the dir is empty, as otherwise we'd need to - * rehash everything and update the dirent keys. 
- */ - ret = bch2_empty_dir_trans(trans, inode_inum(inode)); - if (ret < 0) - return ret; - - ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); - if (ret) - return ret; - - bch2_check_set_feature(c, BCH_FEATURE_casefolding); -#else - printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); - return -EOPNOTSUPP; -#endif - } - - if (s->set_projinherit) { - bi->bi_fields_set &= ~(1 << Inode_opt_project); - bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); - } - - bi->bi_flags &= ~s->mask; - bi->bi_flags |= newflags; - - bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); - return 0; -} - -static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -{ - unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); - - return put_user(flags, arg); -} - -static int bch2_ioc_setflags(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - void __user *arg) -{ - struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; - unsigned uflags; - int ret; - - if (get_user(uflags, (int __user *) arg)) - return -EFAULT; - - s.flags = map_flags_rev(bch_flags_to_uflags, uflags); - if (uflags) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { - ret = -EACCES; - goto setflags_out; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - bch2_write_inode(c, inode, bch2_inode_flags_set, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - -setflags_out: - inode_unlock(&inode->v); - mnt_drop_write_file(file); - return ret; -} - -static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, - struct fsxattr __user *arg) -{ - struct fsxattr fa = { 0 }; - - fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); - - if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) - fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; - - fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; - - return 0; -} - -static int fssetxattr_inode_update_fn(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct flags_set *s = p; - - if (s->projid != bi->bi_project) { - bi->bi_fields_set |= 1U << Inode_opt_project; - bi->bi_project = s->projid; - } - - return bch2_inode_flags_set(trans, inode, bi, p); -} - -static int bch2_ioc_fssetxattr(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - struct fsxattr __user *arg) -{ - struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; - struct fsxattr fa; - int ret; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - s.set_projinherit = true; - s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; - fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - - s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); - if (fa.fsx_xflags) - return -EOPNOTSUPP; - - if (fa.fsx_projid >= U32_MAX) - return -EINVAL; - - /* - * inode fields accessible via the xattr interface are stored with a +1 - * bias, so that 0 means unset: - */ - s.projid = fa.fsx_projid + 1; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { - ret = -EACCES; - goto err; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, 
inode->ei_inum.subvol) ?: - bch2_set_projid(c, inode, fa.fsx_projid) ?: - bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); -err: - inode_unlock(&inode->v); - mnt_drop_write_file(file); - return ret; -} - static int bch2_reinherit_attrs_fn(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -558,23 +358,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) long ret; switch (cmd) { - case FS_IOC_GETFLAGS: - ret = bch2_ioc_getflags(inode, (int __user *) arg); - break; - - case FS_IOC_SETFLAGS: - ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); - break; - - case FS_IOC_FSGETXATTR: - ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); - break; - - case FS_IOC_FSSETXATTR: - ret = bch2_ioc_fssetxattr(c, file, inode, - (void __user *) arg); - break; - case BCHFS_IOC_REINHERIT_ATTRS: ret = bch2_ioc_reinherit_attrs(c, file, inode, (void __user *) arg); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index ecd3bfdcde21..a657e4994b71 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -2,81 +2,6 @@ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, - [__BCH_INODE_casefolded] = S_CASEFOLD, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, - [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_sync] = FS_XFLAG_SYNC, - [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_append] = FS_XFLAG_APPEND, - [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, - [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5a41b1a8e54f..e220e3cba29c 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -51,6 +52,19 @@ static void 
bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_subvolume *); +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +{ + static const __maybe_unused unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, + [__BCH_INODE_casefolded] = S_CASEFOLD, + }; + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); +} + void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -1449,6 +1463,157 @@ static int bch2_open(struct inode *vinode, struct file *file) return generic_file_open(vinode, file); } +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const __maybe_unused unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, + [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const __maybe_unused unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, +}; + +static int bch2_fileattr_get(struct dentry *dentry, + struct fileattr *fa) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); + + fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; + + if (inode->ei_inode.bi_flags & BCH_INODE_casefolded) + fa->flags |= FS_CASEFOLD_FL; + + fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + return 0; +} + +struct flags_set { + unsigned mask; + unsigned flags; + unsigned projid; + bool set_project; +}; + +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = trans->c; + struct flags_set *s = p; + + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not inode->i_flags: + */ + unsigned newflags = s->flags; + unsigned oldflags = bi->bi_flags & s->mask; + + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && + (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) + return -EINVAL; + + if ((newflags ^ oldflags) & BCH_INODE_casefolded) { +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. */ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. 
+ */ + ret = bch2_empty_dir_trans(trans, inode_inum(inode)); + if (ret < 0) + return ret; + + ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); + if (ret) + return ret; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); +#else + printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); + return -EOPNOTSUPP; +#endif + } + + if (s->set_project) { + bi->bi_project = s->projid; + bi->bi_fields_set |= BIT(Inode_opt_project); + } + + bi->bi_flags &= ~s->mask; + bi->bi_flags |= newflags; + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); + return 0; +} + +static int bch2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, + struct fileattr *fa) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct flags_set s = {}; + int ret; + + if (fa->fsx_valid) { + fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + + s.mask = map_defined(bch_flags_to_xflags); + s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); + if (fa->fsx_xflags) + return -EOPNOTSUPP; + + if (fa->fsx_projid >= U32_MAX) + return -EINVAL; + + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ + if ((inode->ei_inode.bi_project || + fa->fsx_projid) && + inode->ei_inode.bi_project != fa->fsx_projid + 1) { + s.projid = fa->fsx_projid + 1; + s.set_project = true; + } + } + + if (fa->flags_valid) { + s.mask = map_defined(bch_flags_to_uflags); + s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); + if (fa->flags) + return -EOPNOTSUPP; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: + (s.set_project + ? bch2_set_projid(c, inode, fa->fsx_projid) + : 0) ?: + bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + return ret; +} + static const struct file_operations bch_file_operations = { .open = bch2_open, .llseek = bch2_llseek, @@ -1476,6 +1641,8 @@ static const struct inode_operations bch_file_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct inode_operations bch_dir_inode_operations = { @@ -1496,6 +1663,8 @@ static const struct inode_operations bch_dir_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct file_operations bch_dir_file_operations = { @@ -1518,6 +1687,8 @@ static const struct inode_operations bch_symlink_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct inode_operations bch_special_inode_operations = { @@ -1528,6 +1699,8 @@ static const struct inode_operations bch_special_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct address_space_operations bch_address_space_operations = { diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 6ba5071ab6dd..3e52c7f8ddd2 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -739,4 +739,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len) *--dst = *src++; } +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for 
(_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + #endif /* _BCACHEFS_UTIL_H */ -- cgit v1.2.3 From 91037037ee3d611ce17f39d75f79c7de394b122a Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Fri, 18 Apr 2025 10:38:13 +0800 Subject: net/mlx5: Fix null-ptr-deref in mlx5_create_{inner_,}ttc_table() Add NULL check for mlx5_get_flow_namespace() returns in mlx5_create_inner_ttc_table() and mlx5_create_ttc_table() to prevent NULL pointer dereference. Fixes: 137f3d50ad2a ("net/mlx5: Support matching on l4_type for ttc_table") Signed-off-by: Henry Martin Reviewed-by: Mark Bloch Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250418023814.71789-2-bsdhenrymartin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c index eb3bd9c7f66e..228d0f6570d4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c @@ -655,6 +655,11 @@ struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev, } ns = mlx5_get_flow_namespace(dev, params->ns_type); + if (!ns) { + kvfree(ttc); + return ERR_PTR(-EOPNOTSUPP); + } + groups = use_l4_type ? &inner_ttc_groups[TTC_GROUPS_USE_L4_TYPE] : &inner_ttc_groups[TTC_GROUPS_DEFAULT]; @@ -728,6 +733,11 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, } ns = mlx5_get_flow_namespace(dev, params->ns_type); + if (!ns) { + kvfree(ttc); + return ERR_PTR(-EOPNOTSUPP); + } + groups = use_l4_type ? &ttc_groups[TTC_GROUPS_USE_L4_TYPE] : &ttc_groups[TTC_GROUPS_DEFAULT]; -- cgit v1.2.3 From fa8fd315127ca48c65e7e6692a84ffcf3d07168e Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Fri, 18 Apr 2025 10:38:14 +0800 Subject: net/mlx5: Move ttc allocation after switch case to prevent leaks Relocate the memory allocation for ttc table after the switch statement that validates params->ns_type in both mlx5_create_inner_ttc_table() and mlx5_create_ttc_table(). This ensures memory is only allocated after confirming valid input, eliminating potential memory leaks when invalid ns_type cases occur. 
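The pattern being fixed is generic enough to show in a few lines of standalone C (illustrative only, not the mlx5 driver): when an early-return validation sits after the allocation, every rejected input leaks the buffer; validating first means nothing has been allocated yet on the failure path.

/* Illustrative only, not the mlx5 driver. */
#include <stdlib.h>

struct table { int mode; };

/* Buggy shape: 't' leaks whenever 'type' is rejected. */
static struct table *create_table_buggy(int type)
{
    struct table *t = calloc(1, sizeof(*t));

    if (!t)
        return NULL;
    if (type != 0 && type != 1)
        return NULL;            /* leak: 't' is never freed */
    t->mode = type;
    return t;
}

/* Fixed shape: validate first, allocate only for valid input. */
static struct table *create_table_fixed(int type)
{
    struct table *t;

    if (type != 0 && type != 1)
        return NULL;            /* nothing allocated yet */

    t = calloc(1, sizeof(*t));
    if (!t)
        return NULL;
    t->mode = type;
    return t;
}

int main(void)
{
    free(create_table_buggy(1));
    free(create_table_fixed(1));
    return 0;
}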
Fixes: 137f3d50ad2a ("net/mlx5: Support matching on l4_type for ttc_table") Signed-off-by: Henry Martin Reviewed-by: Michal Swiatkowski Reviewed-by: Mark Bloch Link: https://patch.msgid.link/20250418023814.71789-3-bsdhenrymartin@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c index 228d0f6570d4..ca9ecec358b2 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/fs_ttc.c @@ -637,10 +637,6 @@ struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev, bool use_l4_type; int err; - ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); - if (!ttc) - return ERR_PTR(-ENOMEM); - switch (params->ns_type) { case MLX5_FLOW_NAMESPACE_PORT_SEL: use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) && @@ -654,6 +650,10 @@ struct mlx5_ttc_table *mlx5_create_inner_ttc_table(struct mlx5_core_dev *dev, return ERR_PTR(-EINVAL); } + ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); + if (!ttc) + return ERR_PTR(-ENOMEM); + ns = mlx5_get_flow_namespace(dev, params->ns_type); if (!ns) { kvfree(ttc); @@ -715,10 +715,6 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, bool use_l4_type; int err; - ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); - if (!ttc) - return ERR_PTR(-ENOMEM); - switch (params->ns_type) { case MLX5_FLOW_NAMESPACE_PORT_SEL: use_l4_type = MLX5_CAP_GEN_2(dev, pcc_ifa2) && @@ -732,6 +728,10 @@ struct mlx5_ttc_table *mlx5_create_ttc_table(struct mlx5_core_dev *dev, return ERR_PTR(-EINVAL); } + ttc = kvzalloc(sizeof(*ttc), GFP_KERNEL); + if (!ttc) + return ERR_PTR(-ENOMEM); + ns = mlx5_get_flow_namespace(dev, params->ns_type); if (!ns) { kvfree(ttc); -- cgit v1.2.3 From d3153c3b42707d26c81083b426f2ef0951bce545 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Thu, 17 Apr 2025 18:53:17 -0700 Subject: net: fix the missing unlock for detached devices The combined condition was left as is when we converted from __dev_get_by_index() to netdev_get_by_index_lock(). There was no need to undo anything with the former, for the latter we need an unlock. 
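The underlying rule is easy to show in standalone form (illustrative only; a pthread mutex stands in for the netdev instance lock, and the names are invented): once a "get and lock" helper has succeeded, every later failure check must leave through a path that drops the lock, while a failed lookup, which never took the lock, must not.

/* Illustrative sketch, not the netdev-genl code. */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t dev_lock = PTHREAD_MUTEX_INITIALIZER;
static int device_present;

/* Returns 0 with dev_lock held, or -ENODEV without it, like a
 * "get by index and lock" helper. */
static int get_device_locked(int index)
{
    if (index < 0)
        return -ENODEV;         /* lookup failed: nothing to undo */
    pthread_mutex_lock(&dev_lock);
    return 0;
}

static int do_bind(int index)
{
    int err = get_device_locked(index);

    if (err)
        return err;             /* lock was never taken */

    /* From here on, every exit must release the lock. */
    if (!device_present) {
        err = -ENODEV;
        goto out_unlock;
    }

    err = 0;                    /* real work would go here */

out_unlock:
    pthread_mutex_unlock(&dev_lock);
    return err;
}

int main(void)
{
    printf("bind: %d\n", do_bind(3));
    return 0;
}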
Fixes: 1d22d3060b9b ("net: drop rtnl_lock for queue_mgmt operations") Reviewed-by: Mina Almasry Link: https://patch.msgid.link/20250418015317.1954107-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- net/core/netdev-genl.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/net/core/netdev-genl.c b/net/core/netdev-genl.c index 5d7af50fe702..230743bdbb14 100644 --- a/net/core/netdev-genl.c +++ b/net/core/netdev-genl.c @@ -861,14 +861,17 @@ int netdev_nl_bind_rx_doit(struct sk_buff *skb, struct genl_info *info) mutex_lock(&priv->lock); + err = 0; netdev = netdev_get_by_index_lock(genl_info_net(info), ifindex); - if (!netdev || !netif_device_present(netdev)) { + if (!netdev) { err = -ENODEV; goto err_unlock_sock; } - - if (!netdev_need_ops_lock(netdev)) { + if (!netif_device_present(netdev)) + err = -ENODEV; + else if (!netdev_need_ops_lock(netdev)) err = -EOPNOTSUPP; + if (err) { NL_SET_BAD_ATTR(info->extack, info->attrs[NETDEV_A_DEV_IFINDEX]); goto err_unlock; -- cgit v1.2.3 From 54bebe46871d4e56e05fcf55c1a37e7efa24e0a8 Mon Sep 17 00:00:00 2001 From: Anastasia Kovaleva Date: Mon, 24 Mar 2025 11:49:33 +0300 Subject: scsi: core: Clear flags for scsi_cmnd that did not complete Commands that have not been completed with scsi_done() do not clear the SCMD_INITIALIZED flag and therefore will not be properly reinitialized. Thus, the next time the scsi_cmnd structure is used, the command may fail in scsi_cmd_runtime_exceeded() due to the old jiffies_at_alloc value: kernel: sd 16:0:1:84: [sdts] tag#405 timing out command, waited 720s kernel: sd 16:0:1:84: [sdts] tag#405 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=66636s Clear flags for commands that have not been completed by SCSI. Fixes: 4abafdc4360d ("block: remove the initialize_rq_fn blk_mq_ops method") Signed-off-by: Anastasia Kovaleva Link: https://lore.kernel.org/r/20250324084933.15932-2-a.kovaleva@yadro.com Signed-off-by: Martin K. Petersen --- drivers/scsi/scsi_lib.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 0d29470e86b0..1b43013d72c0 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1253,8 +1253,12 @@ EXPORT_SYMBOL_GPL(scsi_alloc_request); */ static void scsi_cleanup_rq(struct request *rq) { + struct scsi_cmnd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->flags = 0; + if (rq->rq_flags & RQF_DONTPREP) { - scsi_mq_uninit_cmd(blk_mq_rq_to_pdu(rq)); + scsi_mq_uninit_cmd(cmd); rq->rq_flags &= ~RQF_DONTPREP; } } -- cgit v1.2.3 From 08a966a917fe3d92150fa3cc15793ad5e57051eb Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Sat, 12 Apr 2025 14:59:09 -0500 Subject: scsi: ufs: core: Add NULL check in ufshcd_mcq_compl_pending_transfer() Add a NULL check for the returned hwq pointer by ufshcd_mcq_req_to_hwq(). This is similar to the fix in commit 74736103fb41 ("scsi: ufs: core: Fix ufshcd_abort_one racing issue"). Signed-off-by: Chenyuan Yang Link: https://lore.kernel.org/r/20250412195909.315418-1-chenyuan0y@gmail.com Fixes: ab248643d3d6 ("scsi: ufs: core: Add error handling for MCQ mode") Reviewed-by: Peter Wang Reviewed-by: Bart Van Assche Signed-off-by: Martin K. 
Petersen --- drivers/ufs/core/ufshcd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index ca2b0aae9d11..5cb6132b8147 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -5678,6 +5678,8 @@ static void ufshcd_mcq_compl_pending_transfer(struct ufs_hba *hba, continue; hwq = ufshcd_mcq_req_to_hwq(hba, scsi_cmd_to_rq(cmd)); + if (!hwq) + continue; if (force_compl) { ufshcd_mcq_compl_all_cqes_lock(hba, hwq); -- cgit v1.2.3 From b0b7ee3b574a72283399b9232f6190be07f220c0 Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Tue, 15 Apr 2025 15:45:46 +0530 Subject: scsi: mpi3mr: Add level check to control event logging Ensure event logs are only generated when the debug logging level MPI3_DEBUG_EVENT is enabled. This prevents unnecessary logging. Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20250415101546.204018-1-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 003e1f7005c4..1d7901a8f0e4 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -174,6 +174,9 @@ static void mpi3mr_print_event_data(struct mpi3mr_ioc *mrioc, char *desc = NULL; u16 event; + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT)) + return; + event = event_reply->event; switch (event) { -- cgit v1.2.3 From c083da15f06c808c44444bfcef3c939a07ad394b Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 17 Apr 2025 11:15:01 +0100 Subject: MAINTAINERS: Add ism.h to S390 NETWORKING DRIVERS ism.h appears to be part of s390 networking drivers so add it to the corresponding section in MAINTAINERS. This aids developers, and tooling such as get_maintainer.pl alike to CC patches to the appropriate people and mailing lists. And is in keeping with an ongoing effort for NETWORKING entries in MAINTAINERS to more accurately reflect the way code is maintained. Signed-off-by: Simon Horman Link: https://patch.msgid.link/20250417-ism-maint-v1-1-b001be8545ce@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 067f72e8af10..aef5f9a8acf5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -21270,6 +21270,7 @@ L: linux-s390@vger.kernel.org L: netdev@vger.kernel.org S: Supported F: drivers/s390/net/ +F: include/linux/ism.h S390 PCI SUBSYSTEM M: Niklas Schnelle -- cgit v1.2.3 From e00c1517f2bc73186a18ac2cb1d6c5fee7e95239 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 17 Apr 2025 11:15:02 +0100 Subject: MAINTAINERS: Add s390 networking drivers to NETWORKING DRIVERS These files are already correctly covered by the S390 NETWORKING DRIVERS section. In practice commits for these drivers feed into the Networking subsystem. So it seems appropriate to also list them under NETWORKING DRIVERS. This aids developers, and tooling such as get_maintainer.pl alike to CC patches to all the appropriate people and mailing lists. And is in keeping with an ongoing effort for NETWORKING entries in MAINTAINERS to more accurately reflect the way code is maintained. 
Signed-off-by: Simon Horman Link: https://patch.msgid.link/20250417-ism-maint-v1-2-b001be8545ce@kernel.org Signed-off-by: Jakub Kicinski --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index aef5f9a8acf5..d980da72ec60 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -16770,6 +16770,7 @@ F: Documentation/networking/net_cachelines/net_device.rst F: drivers/connector/ F: drivers/net/ F: drivers/ptp/ +F: drivers/s390/net/ F: include/dt-bindings/net/ F: include/linux/cn_proc.h F: include/linux/etherdevice.h @@ -16779,6 +16780,7 @@ F: include/linux/fddidevice.h F: include/linux/hippidevice.h F: include/linux/if_* F: include/linux/inetdevice.h +F: include/linux/ism.h F: include/linux/netdev* F: include/linux/platform_data/wiznet.h F: include/uapi/linux/cn_proc.h -- cgit v1.2.3 From cc3628dcd851ddd8d418bf0c897024b4621ddc92 Mon Sep 17 00:00:00 2001 From: Alexey Nepomnyashih Date: Thu, 17 Apr 2025 12:21:17 +0000 Subject: xen-netfront: handle NULL returned by xdp_convert_buff_to_frame() The function xdp_convert_buff_to_frame() may return NULL if it fails to correctly convert the XDP buffer into an XDP frame due to memory constraints, internal errors, or invalid data. Failing to check for NULL may lead to a NULL pointer dereference if the result is used later in processing, potentially causing crashes, data corruption, or undefined behavior. On XDP redirect failure, the associated page must be released explicitly if it was previously retained via get_page(). Failing to do so may result in a memory leak, as the page's reference count is not decremented. Cc: stable@vger.kernel.org # v5.9+ Fixes: 6c5aa6fc4def ("xen networking: add basic XDP support for xen-netfront") Signed-off-by: Alexey Nepomnyashih Link: https://patch.msgid.link/20250417122118.1009824-1-sdl@nppct.ru Signed-off-by: Jakub Kicinski --- drivers/net/xen-netfront.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index fc52d5c4c69b..5091e1fa4a0d 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -985,20 +985,27 @@ static u32 xennet_run_xdp(struct netfront_queue *queue, struct page *pdata, act = bpf_prog_run_xdp(prog, xdp); switch (act) { case XDP_TX: - get_page(pdata); xdpf = xdp_convert_buff_to_frame(xdp); + if (unlikely(!xdpf)) { + trace_xdp_exception(queue->info->netdev, prog, act); + break; + } + get_page(pdata); err = xennet_xdp_xmit(queue->info->netdev, 1, &xdpf, 0); - if (unlikely(!err)) + if (unlikely(err <= 0)) { + if (err < 0) + trace_xdp_exception(queue->info->netdev, prog, act); + xdp_return_frame_rx_napi(xdpf); - else if (unlikely(err < 0)) - trace_xdp_exception(queue->info->netdev, prog, act); + } break; case XDP_REDIRECT: get_page(pdata); err = xdp_do_redirect(queue->info->netdev, xdp, prog); *need_xdp_flush = true; - if (unlikely(err)) + if (unlikely(err)) { trace_xdp_exception(queue->info->netdev, prog, act); + xdp_return_buff(xdp); + } break; case XDP_PASS: case XDP_DROP: -- cgit v1.2.3 From 2768b2e2f7d25ae8984ebdcde8ec1014b6fdcd89 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Apr 2025 15:00:03 +0300 Subject: net: enetc: register XDP RX queues with frag_size At the time when bpf_xdp_adjust_tail() gained support for non-linear buffers, ENETC was already generating this kind of geometry on RX, due to its use of 2K half page buffers.
Frames larger than 1472 bytes (without FCS) are stored as multi-buffer, presenting a need for multi buffer support to work properly even in standard MTU circumstances. Allow bpf_xdp_frags_increase_tail() to know the allocation size of paged data, so it can safely permit growing the tailroom of the buffer from XDP programs. Fixes: bf25146a5595 ("bpf: add frags support to the bpf_xdp_adjust_tail() API") Signed-off-by: Vladimir Oltean Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250417120005.3288549-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 2106861463e4..9b333254c73e 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -3362,7 +3362,8 @@ static int enetc_int_vector_init(struct enetc_ndev_priv *priv, int i, bdr->buffer_offset = ENETC_RXB_PAD; priv->rx_ring[i] = bdr; - err = xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0); + err = __xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0, + ENETC_RXB_DMA_SIZE_XDP); if (err) goto free_vector; -- cgit v1.2.3 From 1d587faa5be7e9785b682cc5f58ba8f4100c13ea Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Apr 2025 15:00:04 +0300 Subject: net: enetc: refactor bulk flipping of RX buffers to separate function This small snippet of code ensures that we do something with the array of RX software buffer descriptor elements after passing the skb to the stack. In this case, we see if the other half of the page is reusable, and if so, we "turn around" the buffers, making them directly usable by enetc_refill_rx_ring() without going to enetc_new_page(). We will need to perform this kind of buffer flipping from a new code path, i.e. from XDP_PASS. Currently, enetc_build_skb() does it there buffer by buffer, but in a subsequent change we will stop using enetc_build_skb() for XDP_PASS. 
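For illustration, the factored-out helper has the same shape as any walk over a wrapping range of ring indices; a standalone sketch (generic code, not the enetc driver):

/* Illustrative only, not the enetc driver. */
#include <stdio.h>

#define RING_SIZE 8

static void recycle(unsigned idx)
{
    printf("recycling slot %u\n", idx);
}

/* Visit every slot in [first, last), wrapping at the end of the ring. */
static void bulk_recycle(unsigned first, unsigned last)
{
    while (first != last) {
        recycle(first);
        first = (first + 1) % RING_SIZE;
    }
}

int main(void)
{
    bulk_recycle(6, 2);         /* visits slots 6, 7, 0, 1 */
    return 0;
}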
Signed-off-by: Vladimir Oltean Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250417120005.3288549-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 9b333254c73e..74721995cb1f 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1850,6 +1850,16 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, } } +static void enetc_bulk_flip_buff(struct enetc_bdr *rx_ring, int rx_ring_first, + int rx_ring_last) +{ + while (rx_ring_first != rx_ring_last) { + enetc_flip_rx_buff(rx_ring, + &rx_ring->rx_swbd[rx_ring_first]); + enetc_bdr_idx_inc(rx_ring, &rx_ring_first); + } +} + static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) @@ -1965,11 +1975,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, enetc_xdp_drop(rx_ring, orig_i, i); rx_ring->stats.xdp_redirect_failures++; } else { - while (orig_i != i) { - enetc_flip_rx_buff(rx_ring, - &rx_ring->rx_swbd[orig_i]); - enetc_bdr_idx_inc(rx_ring, &orig_i); - } + enetc_bulk_flip_buff(rx_ring, orig_i, i); xdp_redirect_frm_cnt++; rx_ring->stats.xdp_redirect++; } -- cgit v1.2.3 From 020f0c8b3d396ec8190948f86063e1c45133f839 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Apr 2025 15:00:05 +0300 Subject: net: enetc: fix frame corruption on bpf_xdp_adjust_head/tail() and XDP_PASS Vlatko Markovikj reported that XDP programs attached to ENETC do not work well if they use bpf_xdp_adjust_head() or bpf_xdp_adjust_tail(), combined with the XDP_PASS verdict. A typical use case is to add or remove a VLAN tag. The resulting sk_buff passed to the stack is corrupted, because the algorithm used by the driver for XDP_PASS is to unwind the current buffer pointer in the RX ring and to re-process the current frame with enetc_build_skb() as if XDP hadn't run. That is incorrect because XDP may have modified the geometry of the buffer, which we then are completely unaware of. We are looking at a modified buffer with the original geometry. The initial reaction, both from me and from Vlatko, was to shop around the kernel for code to steal that would calculate a delta between the old and the new XDP buffer geometry, and apply that to the sk_buff too. We noticed that veth and generic xdp have such code. The headroom adjustment is pretty uncontroversial, but what turned out severely problematic is the tailroom. veth has this snippet: __skb_put(skb, off); /* positive on grow, negative on shrink */ which on first sight looks decent enough, except __skb_put() takes an "unsigned int" for the second argument, and the arithmetic seems to only work correctly by coincidence. Second issue, __skb_put() contains a SKB_LINEAR_ASSERT(). It's not a great pattern to make more widespread. The skb may still be nonlinear at that point - it only becomes linear later when resetting skb->data_len to zero. To avoid the above, bpf_prog_run_generic_xdp() does this instead: skb_set_tail_pointer(skb, xdp->data_end - xdp->data); skb->len += off; /* positive on grow, negative on shrink */ which is more open-coded, uses lower-level functions and is in general a bit too much to spread around in driver code. 
Then there is the snippet: if (xdp_buff_has_frags(xdp)) skb->data_len = skb_shinfo(skb)->xdp_frags_size; else skb->data_len = 0; One would have expected __pskb_trim() to be the function of choice for this task. But it's not used in veth/xdpgeneric because the extraneous fragments were _already_ freed by bpf_xdp_adjust_tail() -> bpf_xdp_frags_shrink_tail() -> ... -> __xdp_return() - the backing memory for the skb frags and the xdp frags is the same, but they don't keep individual references. In fact, that is the biggest reason why this snippet cannot be reused as-is, because ENETC temporarily constructs an skb with the original len and the original number of frags. Because the extraneous frags are already freed by bpf_xdp_adjust_tail() and returned to the page allocator, it means the entire approach of using enetc_build_skb() is questionable for XDP_PASS. To avoid that, one would need to elevate the page refcount of all frags before calling bpf_prog_run_xdp() and drop it after XDP_PASS. There are other things that are missing in ENETC's handling of XDP_PASS, like for example updating skb_shinfo(skb)->meta_len. These are all handled correctly and cleanly in commit 539c1fba1ac7 ("xdp: add generic xdp_build_skb_from_buff()"), added to net-next in Dec 2024, and in addition might even be quicker that way. I have a very strong preference towards backporting that commit for "stable", and that is what is used to fix the handling bugs. It is way too messy to go this deep into the guts of an sk_buff from the code of a device driver. Fixes: d1b15102dd16 ("net: enetc: add support for XDP_DROP and XDP_PASS") Reported-by: Vlatko Markovikj Signed-off-by: Vladimir Oltean Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250417120005.3288549-4-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/freescale/enetc/enetc.c | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 74721995cb1f..3ee52f4b1166 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1878,11 +1878,10 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, while (likely(rx_frm_cnt < work_limit)) { union enetc_rx_bd *rxbd, *orig_rxbd; - int orig_i, orig_cleaned_cnt; struct xdp_buff xdp_buff; struct sk_buff *skb; + int orig_i, err; u32 bd_status; - int err; rxbd = enetc_rxbd(rx_ring, i); bd_status = le32_to_cpu(rxbd->r.lstatus); @@ -1897,7 +1896,6 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, break; orig_rxbd = rxbd; - orig_cleaned_cnt = cleaned_cnt; orig_i = i; enetc_build_xdp_buff(rx_ring, bd_status, &rxbd, &i, @@ -1925,15 +1923,21 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->stats.xdp_drops++; break; case XDP_PASS: - rxbd = orig_rxbd; - cleaned_cnt = orig_cleaned_cnt; - i = orig_i; - - skb = enetc_build_skb(rx_ring, bd_status, &rxbd, - &i, &cleaned_cnt, - ENETC_RXB_DMA_SIZE_XDP); - if (unlikely(!skb)) + skb = xdp_build_skb_from_buff(&xdp_buff); + /* Probably under memory pressure, stop NAPI */ + if (unlikely(!skb)) { + enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_drops++; goto out; + } + + enetc_get_offloads(rx_ring, orig_rxbd, skb); + + /* These buffers are about to be owned by the stack. + * Update our buffer cache (the rx_swbd array elements) + * with their other page halves. 
+ */ + enetc_bulk_flip_buff(rx_ring, orig_i, i); napi_gro_receive(napi, skb); break; -- cgit v1.2.3 From db91586b1e8f36122a9e5b8fbced11741488dd22 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 18 Apr 2025 15:40:14 +0900 Subject: ata: libata-scsi: Fix ata_mselect_control_ata_feature() return type The function ata_mselect_control_ata_feature() has a return type defined as unsigned int but this function may return negative error codes, which are correctly propagated up the call chain as integers. Fix ata_mselect_control_ata_feature() to have the correct int return type. While at it, also fix a typo in this function description comment. Fixes: df60f9c64576 ("scsi: ata: libata: Add ATA feature control sub-page translation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv --- drivers/ata/libata-scsi.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2796c0da8257..24e662c837e3 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3886,12 +3886,11 @@ static int ata_mselect_control_spg0(struct ata_queued_cmd *qc, } /* - * Translate MODE SELECT control mode page, sub-pages f2h (ATA feature mode + * Translate MODE SELECT control mode page, sub-page f2h (ATA feature mode * page) into a SET FEATURES command. */ -static unsigned int ata_mselect_control_ata_feature(struct ata_queued_cmd *qc, - const u8 *buf, int len, - u16 *fp) +static int ata_mselect_control_ata_feature(struct ata_queued_cmd *qc, + const u8 *buf, int len, u16 *fp) { struct ata_device *dev = qc->dev; struct ata_taskfile *tf = &qc->tf; -- cgit v1.2.3 From 88474ad734fb2000805c63e01cc53ea930adf2c7 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Apr 2025 14:45:30 +0900 Subject: ata: libata-scsi: Fix ata_msense_control_ata_feature() For the ATA features subpage of the control mode page, the T10 SAT-6 specifications state that: For a MODE SENSE command, the SATL shall return the CDL_CTRL field value that was last set by an application client. However, the function ata_msense_control_ata_feature() always sets the CDL_CTRL field to the 0x02 value to indicate support for the CDL T2A and T2B pages. This is thus incorrect and the value 0x02 must be reported only after the user enables the CDL feature, which is indicated with the ATA_DFLAG_CDL_ENABLED device flag. When this flag is not set, the CDL_CTRL field of the ATA feature subpage of the control mode page must report a value of 0x00. Fix ata_msense_control_ata_feature() to report the correct values for the CDL_CTRL field, according to the enable/disable state of the device CDL feature. 
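The hazard is easy to reproduce in a few lines of standalone C (toy code, not libata): with an unsigned return type the negative errno value survives only by accident, and any comparison against zero in a caller silently stops working.

/* Illustrative toy, not libata code. */
#include <errno.h>
#include <stdio.h>

/* Wrong: cannot represent -EINVAL; it wraps to a huge positive value. */
static unsigned int parse_bad(int len)
{
    if (len < 0)
        return -EINVAL;
    return 0;
}

/* Right: plain int for the usual "0 or negative errno" convention. */
static int parse_good(int len)
{
    if (len < 0)
        return -EINVAL;
    return 0;
}

int main(void)
{
    unsigned int ret = parse_bad(-1);

    if (ret < 0)                /* always false: ret is unsigned */
        printf("never printed\n");

    printf("bad: %u, good: %d\n", ret, parse_good(-1));
    return 0;
}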
Fixes: df60f9c64576 ("scsi: ata: libata: Add ATA feature control sub-page translation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv --- drivers/ata/libata-scsi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 24e662c837e3..a41046cb062c 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -2453,8 +2453,8 @@ static unsigned int ata_msense_control_ata_feature(struct ata_device *dev, */ put_unaligned_be16(ATA_FEATURE_SUB_MPAGE_LEN - 4, &buf[2]); - if (dev->flags & ATA_DFLAG_CDL) - buf[4] = 0x02; /* Support T2A and T2B pages */ + if (dev->flags & ATA_DFLAG_CDL_ENABLED) + buf[4] = 0x02; /* T2A and T2B pages enabled */ else buf[4] = 0; -- cgit v1.2.3 From 17e897a456752ec9c2d7afb3d9baf268b442451b Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 14 Apr 2025 10:25:05 +0900 Subject: ata: libata-scsi: Improve CDL control With ATA devices supporting the CDL feature, using CDL requires that the feature be enabled with a SET FEATURES command. This command is issued as the translated command for the MODE SELECT command issued by scsi_cdl_enable() when the user enables CDL through the device cdl_enable sysfs attribute. Currently, ata_mselect_control_ata_feature() always translates a MODE SELECT command for the ATA features subpage of the control mode page to a SET FEATURES command to enable or disable CDL based on the cdl_ctrl field. However, there is no need to issue the SET FEATURES command if: 1) The MODE SELECT command requests disabling CDL and CDL is already disabled. 2) The MODE SELECT command requests enabling CDL and CDL is already enabled. Fix ata_mselect_control_ata_feature() to issue the SET FEATURES command only when necessary. Since enabling CDL also implies a reset of the CDL statistics log page, avoiding useless CDL enable operations also avoids clearing the CDL statistics log. Also add debug messages to clearly signal when CDL is being enabled or disabled using a SET FEATURES command. Fixes: df60f9c64576 ("scsi: ata: libata: Add ATA feature control sub-page translation") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv --- drivers/ata/libata-scsi.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index a41046cb062c..c0eb8c67a9ff 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -3908,17 +3908,27 @@ static int ata_mselect_control_ata_feature(struct ata_queued_cmd *qc, /* Check cdl_ctrl */ switch (buf[0] & 0x03) { case 0: - /* Disable CDL */ + /* Disable CDL if it is enabled */ + if (!(dev->flags & ATA_DFLAG_CDL_ENABLED)) + return 0; + ata_dev_dbg(dev, "Disabling CDL\n"); cdl_action = 0; dev->flags &= ~ATA_DFLAG_CDL_ENABLED; break; case 0x02: - /* Enable CDL T2A/T2B: NCQ priority must be disabled */ + /* + * Enable CDL if not already enabled. Since this is mutually + * exclusive with NCQ priority, allow this only if NCQ priority + * is disabled. 
+ */ + if (dev->flags & ATA_DFLAG_CDL_ENABLED) + return 0; if (dev->flags & ATA_DFLAG_NCQ_PRIO_ENABLED) { ata_dev_err(dev, "NCQ priority must be disabled to enable CDL\n"); return -EINVAL; } + ata_dev_dbg(dev, "Enabling CDL\n"); cdl_action = 1; dev->flags |= ATA_DFLAG_CDL_ENABLED; break; -- cgit v1.2.3 From 14a3cc755825ef7b34c986aa2786ea815023e9c5 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Sun, 13 Apr 2025 11:24:47 +0900 Subject: scsi: Improve CDL control With ATA devices supporting the CDL feature, using CDL requires that the feature be enabled with a SET FEATURES command. This command is issued as the translated command for the MODE SELECT command issued by scsi_cdl_enable() when the user enables CDL through the device cdl_enable sysfs attribute. However, the implementation of scsi_cdl_enable() always issues a MODE SELECT command for ATA devices when the enable argument is true, even if CDL is already enabled on the device. While this does not cause any issue with using CDL descriptors with read/write commands (the CDL feature will be enabled on the drive), issuing the MODE SELECT command even when the device CDL feature is already enabled will cause a reset of the ATA device CDL statistics log page (as defined in ACS, any CDL enable action must reset the device statistics). Avoid this needless actions (and the implied statistics log page reset) by modifying scsi_cdl_enable() to issue the MODE SELECT command to enable CDL if and only if CDL is not reported as already enabled on the device. And while at it, simplify the initialization of the is_ata boolean variable and move the declaration of the scsi mode data and sense header variables to within the scope of ATA device handling. Fixes: 1b22cfb14142 ("scsi: core: Allow enabling and disabling command duration limits") Cc: stable@vger.kernel.org Signed-off-by: Damien Le Moal Reviewed-by: Niklas Cassel Reviewed-by: Igor Pylypiv Reviewed-by: Martin K. Petersen --- drivers/scsi/scsi.c | 36 ++++++++++++++++++++++++------------ 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/scsi.c b/drivers/scsi/scsi.c index 53daf923ad8e..518a252eb6aa 100644 --- a/drivers/scsi/scsi.c +++ b/drivers/scsi/scsi.c @@ -707,26 +707,23 @@ void scsi_cdl_check(struct scsi_device *sdev) */ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) { - struct scsi_mode_data data; - struct scsi_sense_hdr sshdr; - struct scsi_vpd *vpd; - bool is_ata = false; char buf[64]; + bool is_ata; int ret; if (!sdev->cdl_supported) return -EOPNOTSUPP; rcu_read_lock(); - vpd = rcu_dereference(sdev->vpd_pg89); - if (vpd) - is_ata = true; + is_ata = rcu_dereference(sdev->vpd_pg89); rcu_read_unlock(); /* * For ATA devices, CDL needs to be enabled with a SET FEATURES command. */ if (is_ata) { + struct scsi_mode_data data; + struct scsi_sense_hdr sshdr; char *buf_data; int len; @@ -735,16 +732,30 @@ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) if (ret) return -EINVAL; - /* Enable CDL using the ATA feature page */ + /* Enable or disable CDL using the ATA feature page */ len = min_t(size_t, sizeof(buf), data.length - data.header_length - data.block_descriptor_length); buf_data = buf + data.header_length + data.block_descriptor_length; - if (enable) - buf_data[4] = 0x02; - else - buf_data[4] = 0; + + /* + * If we want to enable CDL and CDL is already enabled on the + * device, do nothing. This avoids needlessly resetting the CDL + * statistics on the device as that is implied by the CDL enable + * action. 
Similar to this, there is no need to do anything if + * we want to disable CDL and CDL is already disabled. + */ + if (enable) { + if ((buf_data[4] & 0x03) == 0x02) + goto out; + buf_data[4] &= ~0x03; + buf_data[4] |= 0x02; + } else { + if ((buf_data[4] & 0x03) == 0x00) + goto out; + buf_data[4] &= ~0x03; + } ret = scsi_mode_select(sdev, 1, 0, buf_data, len, 5 * HZ, 3, &data, &sshdr); @@ -756,6 +767,7 @@ int scsi_cdl_enable(struct scsi_device *sdev, bool enable) } } +out: sdev->cdl_enable = enable; return 0; -- cgit v1.2.3 From f37bb5486ea536c1d61df89feeaeff3f84f0b560 Mon Sep 17 00:00:00 2001 From: Christian Hewitt Date: Mon, 21 Apr 2025 22:12:59 +0200 Subject: Revert "drm/meson: vclk: fix calculation of 59.94 fractional rates" This reverts commit bfbc68e. The patch does permit the offending YUV420 @ 59.94 phy_freq and vclk_freq mode to match in calculations. It also results in all fractional rates being unavailable for use. This was unintended and requires the patch to be reverted. Fixes: bfbc68e4d869 ("drm/meson: vclk: fix calculation of 59.94 fractional rates") Cc: stable@vger.kernel.org Signed-off-by: Christian Hewitt Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250421201300.778955-2-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250421201300.778955-2-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_vclk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_vclk.c b/drivers/gpu/drm/meson/meson_vclk.c index 2a942dc6a6dc..2a82119eb58e 100644 --- a/drivers/gpu/drm/meson/meson_vclk.c +++ b/drivers/gpu/drm/meson/meson_vclk.c @@ -790,13 +790,13 @@ meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned int phy_freq, FREQ_1000_1001(params[i].pixel_freq)); DRM_DEBUG_DRIVER("i = %d phy_freq = %d alt = %d\n", i, params[i].phy_freq, - FREQ_1000_1001(params[i].phy_freq/1000)*1000); + FREQ_1000_1001(params[i].phy_freq/10)*10); /* Match strict frequency */ if (phy_freq == params[i].phy_freq && vclk_freq == params[i].vclk_freq) return MODE_OK; /* Match 1000/1001 variant */ - if (phy_freq == (FREQ_1000_1001(params[i].phy_freq/1000)*1000) && + if (phy_freq == (FREQ_1000_1001(params[i].phy_freq/10)*10) && vclk_freq == FREQ_1000_1001(params[i].vclk_freq)) return MODE_OK; } @@ -1070,7 +1070,7 @@ void meson_vclk_setup(struct meson_drm *priv, unsigned int target, for (freq = 0 ; params[freq].pixel_freq ; ++freq) { if ((phy_freq == params[freq].phy_freq || - phy_freq == FREQ_1000_1001(params[freq].phy_freq/1000)*1000) && + phy_freq == FREQ_1000_1001(params[freq].phy_freq/10)*10) && (vclk_freq == params[freq].vclk_freq || vclk_freq == FREQ_1000_1001(params[freq].vclk_freq))) { if (vclk_freq != params[freq].vclk_freq) -- cgit v1.2.3 From 1017560164b6bbcbc93579266926e6e96675262a Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Mon, 21 Apr 2025 22:13:00 +0200 Subject: drm/meson: use unsigned long long / Hz for frequency types Christian reports that 4K output using YUV420 encoding fails with the following error: Fatal Error, invalid HDMI vclk freq 593406 Modetest shows the following: 3840x2160 59.94 3840 4016 4104 4400 2160 2168 2178 2250 593407 flags: xxxx, xxxx, drm calculated value -------------------------------------^ This indicates that there's a (1kHz) mismatch between the clock calculated by the drm framework and the meson driver. 
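As a quick illustration of where the single kHz goes (arithmetic only, using the numbers above; this is a sketch, not driver code):

	unsigned int vclk_khz = 593407;        /* requested by the drm framework */
	unsigned int half     = vclk_khz / 2;  /* integer division: 296703, the .5 kHz is lost */
	unsigned int doubled  = half * 2;      /* 593406 kHz - no longer matches 593407 */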
Relevant function call stack: (drm framework) -> meson_encoder_hdmi_atomic_enable() -> meson_encoder_hdmi_set_vclk() -> meson_vclk_setup() The video clock requested by the drm framework is 593407kHz. This is passed by meson_encoder_hdmi_atomic_enable() to meson_encoder_hdmi_set_vclk() and the following formula is applied: - the frequency is halved (which would be 296703.5kHz) and rounded down to the next full integer, which is 296703kHz - TMDS clock is calculated (296703kHz * 10) - video encoder clock is calculated - this needs to match a table from meson_vclk.c and so it doubles the previously halved value again (resulting in 593406kHz) - meson_vclk_setup() can't find (either directly, or by deriving it from 594000kHz * 1000 / 1001 and rounding to the closest integer value - which is 593407kHz as originally requested by the drm framework) a matching clock in it's internal table and errors out with "invalid HDMI vclk freq" Fix the division precision by switching the whole meson driver to use unsigned long long (64-bit) Hz values for clock frequencies instead of unsigned int (32-bit) kHz to fix the rouding error. Fixes: e5fab2ec9ca4 ("drm/meson: vclk: add support for YUV420 setup") Reported-by: Christian Hewitt Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250421201300.778955-3-martin.blumenstingl@googlemail.com Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250421201300.778955-3-martin.blumenstingl@googlemail.com --- drivers/gpu/drm/meson/meson_drv.c | 2 +- drivers/gpu/drm/meson/meson_drv.h | 2 +- drivers/gpu/drm/meson/meson_encoder_hdmi.c | 29 +++-- drivers/gpu/drm/meson/meson_vclk.c | 195 +++++++++++++++-------------- drivers/gpu/drm/meson/meson_vclk.h | 13 +- 5 files changed, 126 insertions(+), 115 deletions(-) diff --git a/drivers/gpu/drm/meson/meson_drv.c b/drivers/gpu/drm/meson/meson_drv.c index 81d2ee37e773..49ff9f1f16d3 100644 --- a/drivers/gpu/drm/meson/meson_drv.c +++ b/drivers/gpu/drm/meson/meson_drv.c @@ -169,7 +169,7 @@ static const struct meson_drm_soc_attr meson_drm_soc_attrs[] = { /* S805X/S805Y HDMI PLL won't lock for HDMI PHY freq > 1,65GHz */ { .limits = { - .max_hdmi_phy_freq = 1650000, + .max_hdmi_phy_freq = 1650000000, }, .attrs = (const struct soc_device_attribute []) { { .soc_id = "GXL (S805*)", }, diff --git a/drivers/gpu/drm/meson/meson_drv.h b/drivers/gpu/drm/meson/meson_drv.h index 3f9345c14f31..be4b0e4df6e1 100644 --- a/drivers/gpu/drm/meson/meson_drv.h +++ b/drivers/gpu/drm/meson/meson_drv.h @@ -37,7 +37,7 @@ struct meson_drm_match_data { }; struct meson_drm_soc_limits { - unsigned int max_hdmi_phy_freq; + unsigned long long max_hdmi_phy_freq; }; struct meson_drm { diff --git a/drivers/gpu/drm/meson/meson_encoder_hdmi.c b/drivers/gpu/drm/meson/meson_encoder_hdmi.c index 6d1c9262a2cf..7752d8ac85f0 100644 --- a/drivers/gpu/drm/meson/meson_encoder_hdmi.c +++ b/drivers/gpu/drm/meson/meson_encoder_hdmi.c @@ -70,12 +70,12 @@ static void meson_encoder_hdmi_set_vclk(struct meson_encoder_hdmi *encoder_hdmi, { struct meson_drm *priv = encoder_hdmi->priv; int vic = drm_match_cea_mode(mode); - unsigned int phy_freq; - unsigned int vclk_freq; - unsigned int venc_freq; - unsigned int hdmi_freq; + unsigned long long phy_freq; + unsigned long long vclk_freq; + unsigned long long venc_freq; + unsigned long long hdmi_freq; - vclk_freq = mode->clock; + vclk_freq = mode->clock * 1000; /* For 420, pixel clock is half unlike venc clock */ if (encoder_hdmi->output_bus_fmt == MEDIA_BUS_FMT_UYYVYY8_0_5X24) @@ -107,7 
+107,8 @@ static void meson_encoder_hdmi_set_vclk(struct meson_encoder_hdmi *encoder_hdmi, if (mode->flags & DRM_MODE_FLAG_DBLCLK) venc_freq /= 2; - dev_dbg(priv->dev, "vclk:%d phy=%d venc=%d hdmi=%d enci=%d\n", + dev_dbg(priv->dev, + "vclk:%lluHz phy=%lluHz venc=%lluHz hdmi=%lluHz enci=%d\n", phy_freq, vclk_freq, venc_freq, hdmi_freq, priv->venc.hdmi_use_enci); @@ -122,10 +123,11 @@ static enum drm_mode_status meson_encoder_hdmi_mode_valid(struct drm_bridge *bri struct meson_encoder_hdmi *encoder_hdmi = bridge_to_meson_encoder_hdmi(bridge); struct meson_drm *priv = encoder_hdmi->priv; bool is_hdmi2_sink = display_info->hdmi.scdc.supported; - unsigned int phy_freq; - unsigned int vclk_freq; - unsigned int venc_freq; - unsigned int hdmi_freq; + unsigned long long clock = mode->clock * 1000; + unsigned long long phy_freq; + unsigned long long vclk_freq; + unsigned long long venc_freq; + unsigned long long hdmi_freq; int vic = drm_match_cea_mode(mode); enum drm_mode_status status; @@ -144,12 +146,12 @@ static enum drm_mode_status meson_encoder_hdmi_mode_valid(struct drm_bridge *bri if (status != MODE_OK) return status; - return meson_vclk_dmt_supported_freq(priv, mode->clock); + return meson_vclk_dmt_supported_freq(priv, clock); /* Check against supported VIC modes */ } else if (!meson_venc_hdmi_supported_vic(vic)) return MODE_BAD; - vclk_freq = mode->clock; + vclk_freq = clock; /* For 420, pixel clock is half unlike venc clock */ if (drm_mode_is_420_only(display_info, mode) || @@ -179,7 +181,8 @@ static enum drm_mode_status meson_encoder_hdmi_mode_valid(struct drm_bridge *bri if (mode->flags & DRM_MODE_FLAG_DBLCLK) venc_freq /= 2; - dev_dbg(priv->dev, "%s: vclk:%d phy=%d venc=%d hdmi=%d\n", + dev_dbg(priv->dev, + "%s: vclk:%lluHz phy=%lluHz venc=%lluHz hdmi=%lluHz\n", __func__, phy_freq, vclk_freq, venc_freq, hdmi_freq); return meson_vclk_vic_supported_freq(priv, phy_freq, vclk_freq); diff --git a/drivers/gpu/drm/meson/meson_vclk.c b/drivers/gpu/drm/meson/meson_vclk.c index 2a82119eb58e..3325580d885d 100644 --- a/drivers/gpu/drm/meson/meson_vclk.c +++ b/drivers/gpu/drm/meson/meson_vclk.c @@ -110,7 +110,10 @@ #define HDMI_PLL_LOCK BIT(31) #define HDMI_PLL_LOCK_G12A (3 << 30) -#define FREQ_1000_1001(_freq) DIV_ROUND_CLOSEST(_freq * 1000, 1001) +#define PIXEL_FREQ_1000_1001(_freq) \ + DIV_ROUND_CLOSEST_ULL((_freq) * 1000ULL, 1001ULL) +#define PHY_FREQ_1000_1001(_freq) \ + (PIXEL_FREQ_1000_1001(DIV_ROUND_DOWN_ULL(_freq, 10ULL)) * 10) /* VID PLL Dividers */ enum { @@ -360,11 +363,11 @@ enum { }; struct meson_vclk_params { - unsigned int pll_freq; - unsigned int phy_freq; - unsigned int vclk_freq; - unsigned int venc_freq; - unsigned int pixel_freq; + unsigned long long pll_freq; + unsigned long long phy_freq; + unsigned long long vclk_freq; + unsigned long long venc_freq; + unsigned long long pixel_freq; unsigned int pll_od1; unsigned int pll_od2; unsigned int pll_od3; @@ -372,11 +375,11 @@ struct meson_vclk_params { unsigned int vclk_div; } params[] = { [MESON_VCLK_HDMI_ENCI_54000] = { - .pll_freq = 4320000, - .phy_freq = 270000, - .vclk_freq = 54000, - .venc_freq = 54000, - .pixel_freq = 54000, + .pll_freq = 4320000000, + .phy_freq = 270000000, + .vclk_freq = 54000000, + .venc_freq = 54000000, + .pixel_freq = 54000000, .pll_od1 = 4, .pll_od2 = 4, .pll_od3 = 1, @@ -384,11 +387,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_DDR_54000] = { - .pll_freq = 4320000, - .phy_freq = 270000, - .vclk_freq = 54000, - .venc_freq = 54000, - .pixel_freq = 27000, + .pll_freq = 4320000000, + 
.phy_freq = 270000000, + .vclk_freq = 54000000, + .venc_freq = 54000000, + .pixel_freq = 27000000, .pll_od1 = 4, .pll_od2 = 4, .pll_od3 = 1, @@ -396,11 +399,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_DDR_148500] = { - .pll_freq = 2970000, - .phy_freq = 742500, - .vclk_freq = 148500, - .venc_freq = 148500, - .pixel_freq = 74250, + .pll_freq = 2970000000, + .phy_freq = 742500000, + .vclk_freq = 148500000, + .venc_freq = 148500000, + .pixel_freq = 74250000, .pll_od1 = 4, .pll_od2 = 1, .pll_od3 = 1, @@ -408,11 +411,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_74250] = { - .pll_freq = 2970000, - .phy_freq = 742500, - .vclk_freq = 74250, - .venc_freq = 74250, - .pixel_freq = 74250, + .pll_freq = 2970000000, + .phy_freq = 742500000, + .vclk_freq = 74250000, + .venc_freq = 74250000, + .pixel_freq = 74250000, .pll_od1 = 2, .pll_od2 = 2, .pll_od3 = 2, @@ -420,11 +423,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_148500] = { - .pll_freq = 2970000, - .phy_freq = 1485000, - .vclk_freq = 148500, - .venc_freq = 148500, - .pixel_freq = 148500, + .pll_freq = 2970000000, + .phy_freq = 1485000000, + .vclk_freq = 148500000, + .venc_freq = 148500000, + .pixel_freq = 148500000, .pll_od1 = 1, .pll_od2 = 2, .pll_od3 = 2, @@ -432,11 +435,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_297000] = { - .pll_freq = 5940000, - .phy_freq = 2970000, - .venc_freq = 297000, - .vclk_freq = 297000, - .pixel_freq = 297000, + .pll_freq = 5940000000, + .phy_freq = 2970000000, + .venc_freq = 297000000, + .vclk_freq = 297000000, + .pixel_freq = 297000000, .pll_od1 = 2, .pll_od2 = 1, .pll_od3 = 1, @@ -444,11 +447,11 @@ struct meson_vclk_params { .vclk_div = 2, }, [MESON_VCLK_HDMI_594000] = { - .pll_freq = 5940000, - .phy_freq = 5940000, - .venc_freq = 594000, - .vclk_freq = 594000, - .pixel_freq = 594000, + .pll_freq = 5940000000, + .phy_freq = 5940000000, + .venc_freq = 594000000, + .vclk_freq = 594000000, + .pixel_freq = 594000000, .pll_od1 = 1, .pll_od2 = 1, .pll_od3 = 2, @@ -456,11 +459,11 @@ struct meson_vclk_params { .vclk_div = 1, }, [MESON_VCLK_HDMI_594000_YUV420] = { - .pll_freq = 5940000, - .phy_freq = 2970000, - .venc_freq = 594000, - .vclk_freq = 594000, - .pixel_freq = 297000, + .pll_freq = 5940000000, + .phy_freq = 2970000000, + .venc_freq = 594000000, + .vclk_freq = 594000000, + .pixel_freq = 297000000, .pll_od1 = 2, .pll_od2 = 1, .pll_od3 = 1, @@ -617,16 +620,16 @@ static void meson_hdmi_pll_set_params(struct meson_drm *priv, unsigned int m, 3 << 20, pll_od_to_reg(od3) << 20); } -#define XTAL_FREQ 24000 +#define XTAL_FREQ (24 * 1000 * 1000) static unsigned int meson_hdmi_pll_get_m(struct meson_drm *priv, - unsigned int pll_freq) + unsigned long long pll_freq) { /* The GXBB PLL has a /2 pre-multiplier */ if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXBB)) - pll_freq /= 2; + pll_freq = DIV_ROUND_DOWN_ULL(pll_freq, 2); - return pll_freq / XTAL_FREQ; + return DIV_ROUND_DOWN_ULL(pll_freq, XTAL_FREQ); } #define HDMI_FRAC_MAX_GXBB 4096 @@ -635,12 +638,13 @@ static unsigned int meson_hdmi_pll_get_m(struct meson_drm *priv, static unsigned int meson_hdmi_pll_get_frac(struct meson_drm *priv, unsigned int m, - unsigned int pll_freq) + unsigned long long pll_freq) { - unsigned int parent_freq = XTAL_FREQ; + unsigned long long parent_freq = XTAL_FREQ; unsigned int frac_max = HDMI_FRAC_MAX_GXL; unsigned int frac_m; unsigned int frac; + u32 remainder; /* The GXBB PLL has a /2 pre-multiplier and a larger FRAC width */ if 
(meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXBB)) { @@ -652,11 +656,11 @@ static unsigned int meson_hdmi_pll_get_frac(struct meson_drm *priv, frac_max = HDMI_FRAC_MAX_G12A; /* We can have a perfect match !*/ - if (pll_freq / m == parent_freq && - pll_freq % m == 0) + if (div_u64_rem(pll_freq, m, &remainder) == parent_freq && + remainder == 0) return 0; - frac = div_u64((u64)pll_freq * (u64)frac_max, parent_freq); + frac = mul_u64_u64_div_u64(pll_freq, frac_max, parent_freq); frac_m = m * frac_max; if (frac_m > frac) return frac_max; @@ -666,7 +670,7 @@ static unsigned int meson_hdmi_pll_get_frac(struct meson_drm *priv, } static bool meson_hdmi_pll_validate_params(struct meson_drm *priv, - unsigned int m, + unsigned long long m, unsigned int frac) { if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXBB)) { @@ -694,7 +698,7 @@ static bool meson_hdmi_pll_validate_params(struct meson_drm *priv, } static bool meson_hdmi_pll_find_params(struct meson_drm *priv, - unsigned int freq, + unsigned long long freq, unsigned int *m, unsigned int *frac, unsigned int *od) @@ -706,7 +710,7 @@ static bool meson_hdmi_pll_find_params(struct meson_drm *priv, continue; *frac = meson_hdmi_pll_get_frac(priv, *m, freq * *od); - DRM_DEBUG_DRIVER("PLL params for %dkHz: m=%x frac=%x od=%d\n", + DRM_DEBUG_DRIVER("PLL params for %lluHz: m=%x frac=%x od=%d\n", freq, *m, *frac, *od); if (meson_hdmi_pll_validate_params(priv, *m, *frac)) @@ -718,7 +722,7 @@ static bool meson_hdmi_pll_find_params(struct meson_drm *priv, /* pll_freq is the frequency after the OD dividers */ enum drm_mode_status -meson_vclk_dmt_supported_freq(struct meson_drm *priv, unsigned int freq) +meson_vclk_dmt_supported_freq(struct meson_drm *priv, unsigned long long freq) { unsigned int od, m, frac; @@ -741,7 +745,7 @@ EXPORT_SYMBOL_GPL(meson_vclk_dmt_supported_freq); /* pll_freq is the frequency after the OD dividers */ static void meson_hdmi_pll_generic_set(struct meson_drm *priv, - unsigned int pll_freq) + unsigned long long pll_freq) { unsigned int od, m, frac, od1, od2, od3; @@ -756,7 +760,7 @@ static void meson_hdmi_pll_generic_set(struct meson_drm *priv, od1 = od / od2; } - DRM_DEBUG_DRIVER("PLL params for %dkHz: m=%x frac=%x od=%d/%d/%d\n", + DRM_DEBUG_DRIVER("PLL params for %lluHz: m=%x frac=%x od=%d/%d/%d\n", pll_freq, m, frac, od1, od2, od3); meson_hdmi_pll_set_params(priv, m, frac, od1, od2, od3); @@ -764,17 +768,18 @@ static void meson_hdmi_pll_generic_set(struct meson_drm *priv, return; } - DRM_ERROR("Fatal, unable to find parameters for PLL freq %d\n", + DRM_ERROR("Fatal, unable to find parameters for PLL freq %lluHz\n", pll_freq); } enum drm_mode_status -meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned int phy_freq, - unsigned int vclk_freq) +meson_vclk_vic_supported_freq(struct meson_drm *priv, + unsigned long long phy_freq, + unsigned long long vclk_freq) { int i; - DRM_DEBUG_DRIVER("phy_freq = %d vclk_freq = %d\n", + DRM_DEBUG_DRIVER("phy_freq = %lluHz vclk_freq = %lluHz\n", phy_freq, vclk_freq); /* Check against soc revision/package limits */ @@ -785,19 +790,19 @@ meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned int phy_freq, } for (i = 0 ; params[i].pixel_freq ; ++i) { - DRM_DEBUG_DRIVER("i = %d pixel_freq = %d alt = %d\n", + DRM_DEBUG_DRIVER("i = %d pixel_freq = %lluHz alt = %lluHz\n", i, params[i].pixel_freq, - FREQ_1000_1001(params[i].pixel_freq)); - DRM_DEBUG_DRIVER("i = %d phy_freq = %d alt = %d\n", + PIXEL_FREQ_1000_1001(params[i].pixel_freq)); + DRM_DEBUG_DRIVER("i = %d phy_freq = %lluHz alt = 
%lluHz\n", i, params[i].phy_freq, - FREQ_1000_1001(params[i].phy_freq/10)*10); + PHY_FREQ_1000_1001(params[i].phy_freq)); /* Match strict frequency */ if (phy_freq == params[i].phy_freq && vclk_freq == params[i].vclk_freq) return MODE_OK; /* Match 1000/1001 variant */ - if (phy_freq == (FREQ_1000_1001(params[i].phy_freq/10)*10) && - vclk_freq == FREQ_1000_1001(params[i].vclk_freq)) + if (phy_freq == PHY_FREQ_1000_1001(params[i].phy_freq) && + vclk_freq == PIXEL_FREQ_1000_1001(params[i].vclk_freq)) return MODE_OK; } @@ -805,8 +810,9 @@ meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned int phy_freq, } EXPORT_SYMBOL_GPL(meson_vclk_vic_supported_freq); -static void meson_vclk_set(struct meson_drm *priv, unsigned int pll_base_freq, - unsigned int od1, unsigned int od2, unsigned int od3, +static void meson_vclk_set(struct meson_drm *priv, + unsigned long long pll_base_freq, unsigned int od1, + unsigned int od2, unsigned int od3, unsigned int vid_pll_div, unsigned int vclk_div, unsigned int hdmi_tx_div, unsigned int venc_div, bool hdmi_use_enci, bool vic_alternate_clock) @@ -826,15 +832,15 @@ static void meson_vclk_set(struct meson_drm *priv, unsigned int pll_base_freq, meson_hdmi_pll_generic_set(priv, pll_base_freq); } else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXBB)) { switch (pll_base_freq) { - case 2970000: + case 2970000000: m = 0x3d; frac = vic_alternate_clock ? 0xd02 : 0xe00; break; - case 4320000: + case 4320000000: m = vic_alternate_clock ? 0x59 : 0x5a; frac = vic_alternate_clock ? 0xe8f : 0; break; - case 5940000: + case 5940000000: m = 0x7b; frac = vic_alternate_clock ? 0xa05 : 0xc00; break; @@ -844,15 +850,15 @@ static void meson_vclk_set(struct meson_drm *priv, unsigned int pll_base_freq, } else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXM) || meson_vpu_is_compatible(priv, VPU_COMPATIBLE_GXL)) { switch (pll_base_freq) { - case 2970000: + case 2970000000: m = 0x7b; frac = vic_alternate_clock ? 0x281 : 0x300; break; - case 4320000: + case 4320000000: m = vic_alternate_clock ? 0xb3 : 0xb4; frac = vic_alternate_clock ? 0x347 : 0; break; - case 5940000: + case 5940000000: m = 0xf7; frac = vic_alternate_clock ? 0x102 : 0x200; break; @@ -861,15 +867,15 @@ static void meson_vclk_set(struct meson_drm *priv, unsigned int pll_base_freq, meson_hdmi_pll_set_params(priv, m, frac, od1, od2, od3); } else if (meson_vpu_is_compatible(priv, VPU_COMPATIBLE_G12A)) { switch (pll_base_freq) { - case 2970000: + case 2970000000: m = 0x7b; frac = vic_alternate_clock ? 0x140b4 : 0x18000; break; - case 4320000: + case 4320000000: m = vic_alternate_clock ? 0xb3 : 0xb4; frac = vic_alternate_clock ? 0x1a3ee : 0; break; - case 5940000: + case 5940000000: m = 0xf7; frac = vic_alternate_clock ? 
0x8148 : 0x10000; break; @@ -1025,14 +1031,14 @@ static void meson_vclk_set(struct meson_drm *priv, unsigned int pll_base_freq, } void meson_vclk_setup(struct meson_drm *priv, unsigned int target, - unsigned int phy_freq, unsigned int vclk_freq, - unsigned int venc_freq, unsigned int dac_freq, + unsigned long long phy_freq, unsigned long long vclk_freq, + unsigned long long venc_freq, unsigned long long dac_freq, bool hdmi_use_enci) { bool vic_alternate_clock = false; - unsigned int freq; - unsigned int hdmi_tx_div; - unsigned int venc_div; + unsigned long long freq; + unsigned long long hdmi_tx_div; + unsigned long long venc_div; if (target == MESON_VCLK_TARGET_CVBS) { meson_venci_cvbs_clock_config(priv); @@ -1052,27 +1058,27 @@ void meson_vclk_setup(struct meson_drm *priv, unsigned int target, return; } - hdmi_tx_div = vclk_freq / dac_freq; + hdmi_tx_div = DIV_ROUND_DOWN_ULL(vclk_freq, dac_freq); if (hdmi_tx_div == 0) { - pr_err("Fatal Error, invalid HDMI-TX freq %d\n", + pr_err("Fatal Error, invalid HDMI-TX freq %lluHz\n", dac_freq); return; } - venc_div = vclk_freq / venc_freq; + venc_div = DIV_ROUND_DOWN_ULL(vclk_freq, venc_freq); if (venc_div == 0) { - pr_err("Fatal Error, invalid HDMI venc freq %d\n", + pr_err("Fatal Error, invalid HDMI venc freq %lluHz\n", venc_freq); return; } for (freq = 0 ; params[freq].pixel_freq ; ++freq) { if ((phy_freq == params[freq].phy_freq || - phy_freq == FREQ_1000_1001(params[freq].phy_freq/10)*10) && + phy_freq == PHY_FREQ_1000_1001(params[freq].phy_freq)) && (vclk_freq == params[freq].vclk_freq || - vclk_freq == FREQ_1000_1001(params[freq].vclk_freq))) { + vclk_freq == PIXEL_FREQ_1000_1001(params[freq].vclk_freq))) { if (vclk_freq != params[freq].vclk_freq) vic_alternate_clock = true; else @@ -1098,7 +1104,8 @@ void meson_vclk_setup(struct meson_drm *priv, unsigned int target, } if (!params[freq].pixel_freq) { - pr_err("Fatal Error, invalid HDMI vclk freq %d\n", vclk_freq); + pr_err("Fatal Error, invalid HDMI vclk freq %lluHz\n", + vclk_freq); return; } diff --git a/drivers/gpu/drm/meson/meson_vclk.h b/drivers/gpu/drm/meson/meson_vclk.h index 60617aaf18dd..7ac55744e574 100644 --- a/drivers/gpu/drm/meson/meson_vclk.h +++ b/drivers/gpu/drm/meson/meson_vclk.h @@ -20,17 +20,18 @@ enum { }; /* 27MHz is the CVBS Pixel Clock */ -#define MESON_VCLK_CVBS 27000 +#define MESON_VCLK_CVBS (27 * 1000 * 1000) enum drm_mode_status -meson_vclk_dmt_supported_freq(struct meson_drm *priv, unsigned int freq); +meson_vclk_dmt_supported_freq(struct meson_drm *priv, unsigned long long freq); enum drm_mode_status -meson_vclk_vic_supported_freq(struct meson_drm *priv, unsigned int phy_freq, - unsigned int vclk_freq); +meson_vclk_vic_supported_freq(struct meson_drm *priv, + unsigned long long phy_freq, + unsigned long long vclk_freq); void meson_vclk_setup(struct meson_drm *priv, unsigned int target, - unsigned int phy_freq, unsigned int vclk_freq, - unsigned int venc_freq, unsigned int dac_freq, + unsigned long long phy_freq, unsigned long long vclk_freq, + unsigned long long venc_freq, unsigned long long dac_freq, bool hdmi_use_enci); #endif /* __MESON_VCLK_H */ -- cgit v1.2.3 From 095c8e61f4c71cd4630ee11a82e82cc341b38464 Mon Sep 17 00:00:00 2001 From: Hugo Villeneuve Date: Thu, 17 Apr 2025 15:55:06 -0400 Subject: drm: panel: jd9365da: fix reset signal polarity in unprepare commit a8972d5a49b4 ("drm: panel: jd9365da-h3: fix reset signal polarity") fixed reset signal polarity in jadard_dsi_probe() and jadard_prepare(). 
It was not done in jadard_unprepare() because of an incorrect assumption about reset line handling in power off mode. After looking into the datasheet, it now appears that before disabling regulators, the reset line is deasserted first, and if reset_before_power_off_vcioo is true, then the reset line is asserted. Fix the reset polarity by inverting the second argument of gpiod_set_value() in jadard_unprepare(). Fixes: 6b818c533dd8 ("drm: panel: Add Jadard JD9365DA-H3 DSI panel") Fixes: 2b976ad760dc ("drm/panel: jd9365da: Support for kd101ne3-40ti MIPI-DSI panel") Fixes: a8972d5a49b4 ("drm: panel: jd9365da-h3: fix reset signal polarity") Cc: stable@vger.kernel.org Signed-off-by: Hugo Villeneuve Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250417195507.778731-1-hugo@hugovil.com Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250417195507.778731-1-hugo@hugovil.com --- drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c index 7d68a8acfe2e..eb0f8373258c 100644 --- a/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c +++ b/drivers/gpu/drm/panel/panel-jadard-jd9365da-h3.c @@ -129,11 +129,11 @@ static int jadard_unprepare(struct drm_panel *panel) { struct jadard *jadard = panel_to_jadard(panel); - gpiod_set_value(jadard->reset, 1); + gpiod_set_value(jadard->reset, 0); msleep(120); if (jadard->desc->reset_before_power_off_vcioo) { - gpiod_set_value(jadard->reset, 0); + gpiod_set_value(jadard->reset, 1); usleep_range(1000, 2000); } -- cgit v1.2.3 From 3d7aa0c7b4e96cd460826d932e44710cdeb3378b Mon Sep 17 00:00:00 2001 From: Richard Weinberger Date: Fri, 18 Apr 2025 10:02:50 +0200 Subject: nvmet: fix out-of-bounds access in nvmet_enable_port When trying to enable a port that has no transport configured yet, nvmet_enable_port() uses NVMF_TRTYPE_MAX (255) to query the transports array, causing an out-of-bounds access: [ 106.058694] BUG: KASAN: global-out-of-bounds in nvmet_enable_port+0x42/0x1da [ 106.058719] Read of size 8 at addr ffffffff89dafa58 by task ln/632 [...] [ 106.076026] nvmet: transport type 255 not supported Since commit 200adac75888, NVMF_TRTYPE_MAX is the default state as configured by nvmet_ports_make(). Avoid this by checking for NVMF_TRTYPE_MAX before proceeding.
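For illustration, the failing lookup looks roughly like this (a sketch; the array declaration is paraphrased rather than copied from the driver):

	/* The table is indexed by transport type and sized by the known
	 * types, so the "not configured yet" sentinel NVMF_TRTYPE_MAX
	 * (255) indexes past the end of the array.
	 */
	static struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];

	ops = nvmet_transports[port->disc_addr.trtype];	/* trtype == 255 -> OOB read */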
Fixes: 200adac75888 ("nvme: Add PCI transport type") Signed-off-by: Richard Weinberger Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Damien Le Moal --- drivers/nvme/target/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 71f8d06998d6..245475c43127 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -324,6 +324,9 @@ int nvmet_enable_port(struct nvmet_port *port) lockdep_assert_held(&nvmet_config_sem); + if (port->disc_addr.trtype == NVMF_TRTYPE_MAX) + return -EINVAL; + ops = nvmet_transports[port->disc_addr.trtype]; if (!ops) { up_write(&nvmet_config_sem); -- cgit v1.2.3 From 30a41ed32d3088cd0d682a13d7f30b23baed7e93 Mon Sep 17 00:00:00 2001 From: Fiona Klute Date: Wed, 16 Apr 2025 12:24:13 +0200 Subject: net: phy: microchip: force IRQ polling mode for lan88xx With lan88xx based devices the lan78xx driver can get stuck in an interrupt loop while bringing the device up, flooding the kernel log with messages like the following: lan78xx 2-3:1.0 enp1s0u3: kevent 4 may have been dropped Removing interrupt support from the lan88xx PHY driver forces the driver to use polling instead, which avoids the problem. The issue has been observed with Raspberry Pi devices at least since 4.14 (see [1], bug report for their downstream kernel), as well as with Nvidia devices [2] in 2020, where disabling interrupts was the vendor-suggested workaround (together with the claim that phylib changes in 4.9 made the interrupt handling in lan78xx incompatible). Iperf reports well over 900Mbits/sec per direction with client in --dualtest mode, so there does not seem to be a significant impact on throughput (lan88xx device connected via switch to the peer). [1] https://github.com/raspberrypi/linux/issues/2447 [2] https://forums.developer.nvidia.com/t/jetson-xavier-and-lan7800-problem/142134/11 Link: https://lore.kernel.org/0901d90d-3f20-4a10-b680-9c978e04ddda@lunn.ch Fixes: 792aec47d59d ("add microchip LAN88xx phy driver") Signed-off-by: Fiona Klute Cc: kernel-list@raspberrypi.com Cc: stable@vger.kernel.org Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250416102413.30654-1-fiona.klute@gmx.de Signed-off-by: Paolo Abeni --- drivers/net/phy/microchip.c | 46 +++------------------------------------------ 1 file changed, 3 insertions(+), 43 deletions(-) diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c index 0e17cc458efd..93de88c1c8fd 100644 --- a/drivers/net/phy/microchip.c +++ b/drivers/net/phy/microchip.c @@ -37,47 +37,6 @@ static int lan88xx_write_page(struct phy_device *phydev, int page) return __phy_write(phydev, LAN88XX_EXT_PAGE_ACCESS, page); } -static int lan88xx_phy_config_intr(struct phy_device *phydev) -{ - int rc; - - if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { - /* unmask all source and clear them before enable */ - rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF); - rc = phy_read(phydev, LAN88XX_INT_STS); - rc = phy_write(phydev, LAN88XX_INT_MASK, - LAN88XX_INT_MASK_MDINTPIN_EN_ | - LAN88XX_INT_MASK_LINK_CHANGE_); - } else { - rc = phy_write(phydev, LAN88XX_INT_MASK, 0); - if (rc) - return rc; - - /* Ack interrupts after they have been disabled */ - rc = phy_read(phydev, LAN88XX_INT_STS); - } - - return rc < 0 ? 
rc : 0; -} - -static irqreturn_t lan88xx_handle_interrupt(struct phy_device *phydev) -{ - int irq_status; - - irq_status = phy_read(phydev, LAN88XX_INT_STS); - if (irq_status < 0) { - phy_error(phydev); - return IRQ_NONE; - } - - if (!(irq_status & LAN88XX_INT_STS_LINK_CHANGE_)) - return IRQ_NONE; - - phy_trigger_machine(phydev); - - return IRQ_HANDLED; -} - static int lan88xx_suspend(struct phy_device *phydev) { struct lan88xx_priv *priv = phydev->priv; @@ -528,8 +487,9 @@ static struct phy_driver microchip_phy_driver[] = { .config_aneg = lan88xx_config_aneg, .link_change_notify = lan88xx_link_change_notify, - .config_intr = lan88xx_phy_config_intr, - .handle_interrupt = lan88xx_handle_interrupt, + /* Interrupt handling is broken, do not define related + * functions to force polling. + */ .suspend = lan88xx_suspend, .resume = genphy_resume, -- cgit v1.2.3 From cae5572ec9261f752af834cdaaf5a0ba0afcf256 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 22 Apr 2025 21:40:34 +1000 Subject: dma-mapping: Fix warning reported for missing prototype lkp reported a warning about missing prototype for a recent patch. The kernel-doc style comments are out of sync, move them to the right function. Cc: Marek Szyprowski Cc: Christoph Hellwig Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504190615.g9fANxHw-lkp@intel.com/ Signed-off-by: Balbir Singh [mszyprow: reformatted subject] Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250422114034.3535515-1-balbirs@nvidia.com --- kernel/dma/mapping.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 67da08fa6723..051a32988040 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -910,14 +910,6 @@ int dma_set_coherent_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_coherent_mask); -/** - * dma_addressing_limited - return if the device is addressing limited - * @dev: device to check - * - * Return %true if the devices DMA mask is too small to address all memory in - * the system, else %false. Lack of addressing bits is the prime reason for - * bounce buffering, but might not be the only one. - */ static bool __dma_addressing_limited(struct device *dev) { const struct dma_map_ops *ops = get_dma_ops(dev); @@ -931,6 +923,14 @@ static bool __dma_addressing_limited(struct device *dev) return !dma_direct_all_ram_mapped(dev); } +/** + * dma_addressing_limited - return if the device is addressing limited + * @dev: device to check + * + * Return %true if the devices DMA mask is too small to address all memory in + * the system, else %false. Lack of addressing bits is the prime reason for + * bounce buffering, but might not be the only one. + */ bool dma_addressing_limited(struct device *dev) { if (!__dma_addressing_limited(dev)) -- cgit v1.2.3 From 9e8d1013b0c38910cbc9e60de74dbe883878469d Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 16 Apr 2025 18:01:25 +0200 Subject: net: selftests: initialize TCP header and skb payload with zero Zero-initialize TCP header via memset() to avoid garbage values that may affect checksum or behavior during test transmission. Also zero-fill allocated payload and padding regions using memset() after skb_put(), ensuring deterministic content for all outgoing test packets. 
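As background (a sketch, not part of the patch): skb_put() only extends the data area of the skb, it does not clear the new bytes, which is why the explicit memset() is needed; skb_put_zero() is the usual shorthand when extending and zeroing in one step.

	void *p;

	p = skb_put(skb, 64);		/* 64 new bytes, contents undefined */
	memset(p, 0, 64);		/* make the test payload deterministic */

	p = skb_put_zero(skb, 64);	/* equivalent: extend and zero in one call */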
Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Oleksij Rempel Cc: stable@vger.kernel.org Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250416160125.2914724-1-o.rempel@pengutronix.de Signed-off-by: Paolo Abeni --- net/core/selftests.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/core/selftests.c b/net/core/selftests.c index e99ae983fca9..35f807ea9952 100644 --- a/net/core/selftests.c +++ b/net/core/selftests.c @@ -100,10 +100,10 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, ehdr->h_proto = htons(ETH_P_IP); if (attr->tcp) { + memset(thdr, 0, sizeof(*thdr)); thdr->source = htons(attr->sport); thdr->dest = htons(attr->dport); thdr->doff = sizeof(struct tcphdr) / 4; - thdr->check = 0; } else { uhdr->source = htons(attr->sport); uhdr->dest = htons(attr->dport); @@ -144,10 +144,18 @@ static struct sk_buff *net_test_get_skb(struct net_device *ndev, attr->id = net_test_next_id; shdr->id = net_test_next_id++; - if (attr->size) - skb_put(skb, attr->size); - if (attr->max_size && attr->max_size > skb->len) - skb_put(skb, attr->max_size - skb->len); + if (attr->size) { + void *payload = skb_put(skb, attr->size); + + memset(payload, 0, attr->size); + } + + if (attr->max_size && attr->max_size > skb->len) { + size_t pad_len = attr->max_size - skb->len; + void *pad = skb_put(skb, pad_len); + + memset(pad, 0, pad_len); + } skb->csum = 0; skb->ip_summed = CHECKSUM_PARTIAL; -- cgit v1.2.3 From c03a49f3093a4903c8a93c8b5c9a297b5343b169 Mon Sep 17 00:00:00 2001 From: Justin Iurman Date: Wed, 16 Apr 2025 18:07:16 +0200 Subject: net: lwtunnel: disable BHs when required In lwtunnel_{output|xmit}(), dev_xmit_recursion() may be called in preemptible scope for PREEMPT kernels. This patch disables BHs before calling dev_xmit_recursion(). BHs are re-enabled only at the end, since we must ensure the same CPU is used for both dev_xmit_recursion_inc() and dev_xmit_recursion_dec() (and any other recursion levels in some cases) in order to maintain valid per-cpu counters. 
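Condensed, the pattern after the change looks like this (a sketch of the shape only, not the full hunk):

	int ret = -EOPNOTSUPP;

	local_bh_disable();
	if (dev_xmit_recursion())
		goto drop;

	dev_xmit_recursion_inc();	/* per-CPU counter, only valid on this CPU */
	ret = ops->output(net, sk, skb);
	dev_xmit_recursion_dec();	/* must run on the same CPU as the inc */
	goto out;
drop:
	kfree_skb(skb);
out:
	local_bh_enable();		/* BHs stay off until inc/dec are balanced */
	return ret;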
Reported-by: Alexei Starovoitov Closes: https://lore.kernel.org/netdev/CAADnVQJFWn3dBFJtY+ci6oN1pDFL=TzCmNbRgey7MdYxt_AP2g@mail.gmail.com/ Reported-by: Eduard Zingerman Closes: https://lore.kernel.org/netdev/m2h62qwf34.fsf@gmail.com/ Fixes: 986ffb3a57c5 ("net: lwtunnel: fix recursion loops") Signed-off-by: Justin Iurman Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250416160716.8823-1-justin.iurman@uliege.be Signed-off-by: Paolo Abeni --- net/core/lwtunnel.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index e39a459540ec..60f27cb4e54f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -333,6 +333,8 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) struct dst_entry *dst; int ret; + local_bh_disable(); + if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); @@ -348,8 +350,10 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); @@ -364,11 +368,13 @@ int lwtunnel_output(struct net *net, struct sock *sk, struct sk_buff *skb) if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_output); @@ -380,6 +386,8 @@ int lwtunnel_xmit(struct sk_buff *skb) struct dst_entry *dst; int ret; + local_bh_disable(); + if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); @@ -396,8 +404,10 @@ int lwtunnel_xmit(struct sk_buff *skb) lwtstate = dst->lwtstate; if (lwtstate->type == LWTUNNEL_ENCAP_NONE || - lwtstate->type > LWTUNNEL_ENCAP_MAX) - return 0; + lwtstate->type > LWTUNNEL_ENCAP_MAX) { + ret = 0; + goto out; + } ret = -EOPNOTSUPP; rcu_read_lock(); @@ -412,11 +422,13 @@ int lwtunnel_xmit(struct sk_buff *skb) if (ret == -EOPNOTSUPP) goto drop; - return ret; + goto out; drop: kfree_skb(skb); +out: + local_bh_enable(); return ret; } EXPORT_SYMBOL_GPL(lwtunnel_xmit); @@ -428,6 +440,8 @@ int lwtunnel_input(struct sk_buff *skb) struct dst_entry *dst; int ret; + DEBUG_NET_WARN_ON_ONCE(!in_softirq()); + if (dev_xmit_recursion()) { net_crit_ratelimited("%s(): recursion limit reached on datapath\n", __func__); -- cgit v1.2.3 From bd7c19331913b955a7823e6315ca16bbcc65aeff Mon Sep 17 00:00:00 2001 From: Carlos Maiolino Date: Tue, 22 Apr 2025 14:54:54 +0200 Subject: XFS: fix zoned gc threshold math for 32-bit arches xfs_zoned_need_gc makes use of mult_frac() to calculate the threshold for triggering the zoned garbage collector, but, turns out mult_frac() doesn't properly work with 64-bit data types and this caused build failures on some 32-bit architectures. Fix this by essentially open coding mult_frac() in a 64-bit friendly way. Notice we don't need to bother with counters underflow here because xfs_estimate_freecounter() will always return a positive value, as it leverages percpu_counter_read_positive to read such counters. 
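For reference, a hedged sketch of what mult_frac() roughly expands to and why it breaks here: with a 64-bit x, the plain '/' and '%' below turn into out-of-line 64-bit division helpers from the compiler runtime on 32-bit targets, which the kernel does not provide, so the threshold has to be open coded with div_s64()/div_s64_rem() as done in the hunk below.

	/* Sketch only, not the kernel macro verbatim: */
	#define mult_frac_sketch(x, n, d) \
		(((x) / (d)) * (n) + (((x) % (d)) * (n)) / (d))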
Fixes: 845abeb1f06a ("xfs: add tunable threshold parameter for triggering zone GC") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202504181233.F7D9Atra-lkp@intel.com/ Signed-off-by: Carlos Maiolino Tested-by: Guenter Roeck Reviewed-by: Christoph Hellwig Reviewed-by: Hans Holmberg Signed-off-by: Carlos Maiolino --- fs/xfs/xfs_zone_gc.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index 8c541ca71872..81c94dd1d596 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -170,7 +170,8 @@ bool xfs_zoned_need_gc( struct xfs_mount *mp) { - s64 available, free; + s64 available, free, threshold; + s32 remainder; if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) return false; @@ -183,7 +184,12 @@ xfs_zoned_need_gc( return true; free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); - if (available < mult_frac(free, mp->m_zonegc_low_space, 100)) + + threshold = div_s64_rem(free, 100, &remainder); + threshold = threshold * mp->m_zonegc_low_space + + remainder * div_s64(mp->m_zonegc_low_space, 100); + + if (available < threshold) return true; return false; -- cgit v1.2.3 From f0447f80aec83f1699d599c94618bb5c323963e6 Mon Sep 17 00:00:00 2001 From: Hans Holmberg Date: Tue, 22 Apr 2025 11:50:07 +0000 Subject: xfs: remove duplicate Zoned Filesystems sections in admin-guide Remove the duplicated section and while at it, turn spaces into tabs. Signed-off-by: Hans Holmberg Reviewed-by: Christoph Hellwig Reviewed-by: Carlos Maiolino Fixes: c7b67ddc3c99 ("xfs: document zoned rt specifics in admin-guide") Signed-off-by: Carlos Maiolino --- Documentation/admin-guide/xfs.rst | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 3e76276bd488..5becb441c3cb 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -562,7 +562,7 @@ The interesting knobs for XFS workqueues are as follows: Zoned Filesystems ================= -For zoned file systems, the following attribute is exposed in: +For zoned file systems, the following attributes are exposed in: /sys/fs/xfs//zoned/ @@ -572,23 +572,10 @@ For zoned file systems, the following attribute is exposed in: is limited by the capabilities of the backing zoned device, file system size and the max_open_zones mount option. -Zoned Filesystems -================= - -For zoned file systems, the following attributes are exposed in: - - /sys/fs/xfs//zoned/ - - max_open_zones (Min: 1 Default: Varies Max: UINTMAX) - This read-only attribute exposes the maximum number of open zones - available for data placement. The value is determined at mount time and - is limited by the capabilities of the backing zoned device, file system - size and the max_open_zones mount option. - - zonegc_low_space (Min: 0 Default: 0 Max: 100) - Define a percentage for how much of the unused space that GC should keep - available for writing. A high value will reclaim more of the space - occupied by unused blocks, creating a larger buffer against write - bursts at the cost of increased write amplification. Regardless - of this value, garbage collection will always aim to free a minimum - amount of blocks to keep max_open_zones open for data placement purposes. + zonegc_low_space (Min: 0 Default: 0 Max: 100) + Define a percentage for how much of the unused space that GC should keep + available for writing. 
A high value will reclaim more of the space + occupied by unused blocks, creating a larger buffer against write + bursts at the cost of increased write amplification. Regardless + of this value, garbage collection will always aim to free a minimum + amount of blocks to keep max_open_zones open for data placement purposes. -- cgit v1.2.3 From 30d68cb0c37ebe2dc63aa1d46a28b9163e61caa2 Mon Sep 17 00:00:00 2001 From: Frederick Lawler Date: Thu, 27 Mar 2025 11:09:11 -0500 Subject: ima: process_measurement() needlessly takes inode_lock() on MAY_READ On IMA policy update, if a measure rule exists in the policy, IMA_MEASURE is set for ima_policy_flags which makes the violation_check variable always true. Coupled with a no-action on MAY_READ for a FILE_CHECK call, we're always taking the inode_lock(). This becomes a performance problem for extremely heavy read-only workloads. Therefore, prevent this only in the case there's no action to be taken. Signed-off-by: Frederick Lawler Acked-by: Roberto Sassu Signed-off-by: Mimi Zohar --- security/integrity/ima/ima_main.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f3e7ac513db3..f99ab1a3b0f0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -245,7 +245,9 @@ static int process_measurement(struct file *file, const struct cred *cred, &allowed_algos); violation_check = ((func == FILE_CHECK || func == MMAP_CHECK || func == MMAP_CHECK_REQPROT) && - (ima_policy_flag & IMA_MEASURE)); + (ima_policy_flag & IMA_MEASURE) && + ((action & IMA_MEASURE) || + (file->f_mode & FMODE_WRITE))); if (!action && !violation_check) return 0; -- cgit v1.2.3 From 89461db349cc00816c01d55507d511466b3b7151 Mon Sep 17 00:00:00 2001 From: Chen-Yu Tsai Date: Mon, 21 Apr 2025 16:39:29 +0800 Subject: dma-coherent: Warn if OF reserved memory is beyond current coherent DMA mask When a reserved memory region described in the device tree is attached to a device, it is expected that the device's limitations are correctly included in that description. However, if the device driver failed to implement DMA address masking or addressing beyond the default 32 bits (on arm64), then bad things could happen because the DMA address was truncated, such as playing back audio with no actual audio coming out, or DMA overwriting random blocks of kernel memory. Check against the coherent DMA mask when the memory regions are attached to the device. Give a warning when the memory region can not be covered by the mask. A warning instead of a hard error was chosen, because it is possible that existing drivers could be working fine even if they forgot to extend the coherent DMA mask. 
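A minimal sketch (hypothetical driver probe(), not taken from any real driver) of the piece that is typically missing when this warning fires: without such a call the coherent mask stays at the 32-bit default, so a reserved region above 4 GiB is out of reach.

	ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64));
	if (ret)
		dev_warn(dev, "falling back to 32-bit DMA addressing\n");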
Signed-off-by: Chen-Yu Tsai Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20250421083930.374173-1-wenst@chromium.org --- kernel/dma/coherent.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/dma/coherent.c b/kernel/dma/coherent.c index 3b2bdca9f1d4..77c8d9487a9a 100644 --- a/kernel/dma/coherent.c +++ b/kernel/dma/coherent.c @@ -336,16 +336,22 @@ static phys_addr_t dma_reserved_default_memory_size __initdata; static int rmem_dma_device_init(struct reserved_mem *rmem, struct device *dev) { - if (!rmem->priv) { - struct dma_coherent_mem *mem; + struct dma_coherent_mem *mem = rmem->priv; + if (!mem) { mem = dma_init_coherent_memory(rmem->base, rmem->base, rmem->size, true); if (IS_ERR(mem)) return PTR_ERR(mem); rmem->priv = mem; } - dma_assign_coherent_memory(dev, rmem->priv); + + /* Warn if the device potentially can't use the reserved memory */ + if (mem->device_base + rmem->size - 1 > + min_not_zero(dev->coherent_dma_mask, dev->bus_dma_limit)) + dev_warn(dev, "reserved memory is beyond device's set DMA address range\n"); + + dma_assign_coherent_memory(dev, mem); return 0; } -- cgit v1.2.3 From 4ea404fdbc39971814cd3eb36b43c11fb6f32e17 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 22 Apr 2025 16:43:29 +0100 Subject: lib: Ensure prime numbers tests are included in KUnit test runs When the select of PRIME_NUMBERS was removed from its KUnit test Kconfig, nothing was added to the KUnit configs, meaning that when run via the KUnit runner the tests are neither built nor run. Add PRIME_NUMBERS to all_tests.config so they are enabled when the KUnit runner builds the kernel. Fixes: 3f2925174f8b ("lib/prime_numbers: KUnit test should not select PRIME_NUMBERS") Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250422-lib-fix-prime-numbers-kunit-v1-1-4278c1d4a4ae@kernel.org Signed-off-by: Kees Cook --- tools/testing/kunit/configs/all_tests.config | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/kunit/configs/all_tests.config b/tools/testing/kunit/configs/all_tests.config index cdd9782f9646..7bb885b0c32d 100644 --- a/tools/testing/kunit/configs/all_tests.config +++ b/tools/testing/kunit/configs/all_tests.config @@ -43,6 +43,8 @@ CONFIG_REGMAP_BUILD=y CONFIG_AUDIT=y +CONFIG_PRIME_NUMBERS=y + CONFIG_SECURITY=y CONFIG_SECURITY_APPARMOR=y CONFIG_SECURITY_LANDLOCK=y -- cgit v1.2.3 From 7ffe3de53a885dbb5836541c2178bd07d1bad7df Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:15 -0700 Subject: fs/buffer: split locking for pagecache lookups Callers of __find_get_block() may or may not allow for blocking semantics, and it is currently assumed that they will not. Lay out two paths based on this. The private_lock scheme will continue to be used for atomic contexts. Otherwise take the folio lock instead, which protects the buffers, such as vs migration and try_to_free_buffers(). Per the "hack idea", the latter can alleviate contention on the private_lock for bdev mappings. For reasons of determinism and to avoid making bugs hard to reproduce, the trylocking is not attempted. No change in semantics. All lookup users still take the spinlock.
Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-2-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/buffer.c | 41 +++++++++++++++++++++++++---------------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index c7abb4a029dc..f8fcffdbe5d9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) } EXPORT_SYMBOL(end_buffer_write_sync); -/* - * Various filesystems appear to want __find_get_block to be non-blocking. - * But it's the page lock which protects the buffers. To get around this, - * we get exclusion from try_to_free_buffers with the blockdev mapping's - * i_private_lock. - * - * Hack idea: for the blockdev mapping, i_private_lock contention - * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take i_private_lock. - */ static struct buffer_head * -__find_get_block_slow(struct block_device *bdev, sector_t block) +__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; @@ -204,7 +194,16 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->i_private_lock); + /* + * Folio lock protects the buffers. Callers that cannot block + * will fallback to serializing vs try_to_free_buffers() via + * the i_private_lock. + */ + if (atomic) + spin_lock(&bd_mapping->i_private_lock); + else + folio_lock(folio); + head = folio_buffers(folio); if (!head) goto out_unlock; @@ -236,7 +235,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << blkbits); } out_unlock: - spin_unlock(&bd_mapping->i_private_lock); + if (atomic) + spin_unlock(&bd_mapping->i_private_lock); + else + folio_unlock(folio); folio_put(folio); out: return ret; @@ -1388,14 +1390,15 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) * it in the LRU and mark it as accessed. If it is not present then return * NULL */ -struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +static struct buffer_head * +find_get_block_common(struct block_device *bdev, sector_t block, + unsigned size, bool atomic) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */ - bh = __find_get_block_slow(bdev, block); + bh = __find_get_block_slow(bdev, block, atomic); if (bh) bh_lru_install(bh); } else @@ -1403,6 +1406,12 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) return bh; } + +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + return find_get_block_common(bdev, block, size, true); +} EXPORT_SYMBOL(__find_get_block); /** -- cgit v1.2.3 From 559a0d7bf1a6e5a5d0ad4ab4b0089145042e3109 Mon Sep 17 00:00:00 2001 From: Viacheslav Dubeyko Date: Thu, 17 Apr 2025 15:35:07 -0700 Subject: MAINTAINERS: add HFS/HFS+ maintainers Both the hfs and hfsplus filesystem have been orphaned since at least 2014, i.e., over 10 years. 
However, HFS/HFS+ driver needs to stay for Debian Ports as otherwise we won't be able to boot PowerMacs using GRUB because GRUB won't be usable anymore on PowerMacs with HFS/HFS+ being removed from the kernel. This patch proposes to add Viacheslav Dubeyko and John Paul Adrian Glaubitz as maintainers of HFS/HFS+ driver. Signed-off-by: Viacheslav Dubeyko Link: https://lore.kernel.org/20250417223507.1097186-1-slava@dubeyko.com Signed-off-by: Christian Brauner --- MAINTAINERS | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index fa1e04e87d1d..b8d1e41c27f6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10457,14 +10457,18 @@ S: Supported F: drivers/infiniband/hw/hfi1 HFS FILESYSTEM +M: Viacheslav Dubeyko +M: John Paul Adrian Glaubitz L: linux-fsdevel@vger.kernel.org -S: Orphan +S: Maintained F: Documentation/filesystems/hfs.rst F: fs/hfs/ HFSPLUS FILESYSTEM +M: Viacheslav Dubeyko +M: John Paul Adrian Glaubitz L: linux-fsdevel@vger.kernel.org -S: Orphan +S: Maintained F: Documentation/filesystems/hfsplus.rst F: fs/hfsplus/ -- cgit v1.2.3 From 2814a7d3d2ff5d2cdd22936f641f758fdb971fa0 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:16 -0700 Subject: fs/buffer: introduce sleeping flavors for pagecache lookups Add __find_get_block_nonatomic() and sb_find_get_block_nonatomic() calls for which users will be converted where safe. These versions will take the folio lock instead of the mapping's private_lock. Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-3-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/buffer.c | 9 +++++++++ include/linux/buffer_head.h | 8 ++++++++ 2 files changed, 17 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index f8fcffdbe5d9..5b1d74c818e9 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1414,6 +1414,15 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) } EXPORT_SYMBOL(__find_get_block); +/* same as __find_get_block() but allows sleeping contexts */ +struct buffer_head * +__find_get_block_nonatomic(struct block_device *bdev, sector_t block, + unsigned size) +{ + return find_get_block_common(bdev, block, size, false); +} +EXPORT_SYMBOL(__find_get_block_nonatomic); + /** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. 
diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index f0a4ad7839b6..c791aa9a08da 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -222,6 +222,8 @@ void __wait_on_buffer(struct buffer_head *); wait_queue_head_t *bh_waitq_head(struct buffer_head *bh); struct buffer_head *__find_get_block(struct block_device *bdev, sector_t block, unsigned size); +struct buffer_head *__find_get_block_nonatomic(struct block_device *bdev, + sector_t block, unsigned size); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp); void __brelse(struct buffer_head *); @@ -397,6 +399,12 @@ sb_find_get_block(struct super_block *sb, sector_t block) return __find_get_block(sb->s_bdev, block, sb->s_blocksize); } +static inline struct buffer_head * +sb_find_get_block_nonatomic(struct super_block *sb, sector_t block) +{ + return __find_get_block_nonatomic(sb->s_bdev, block, sb->s_blocksize); +} + static inline void map_bh(struct buffer_head *bh, struct super_block *sb, sector_t block) { -- cgit v1.2.3 From 5b67d43976828dea2394eae2556b369bb7a61f64 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:17 -0700 Subject: fs/buffer: use sleeping version of __find_get_block() Convert to the new nonatomic flavor to benefit from potential performance benefits and adapt in the future vs migration such that semantics are kept. Convert write_boundary_block(), which already takes the buffer lock, as well as bdev_getblk(), depending on the respective gfp flags. There are no changes in semantics. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-4-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev # [0] [1] Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/buffer.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 5b1d74c818e9..f8c9e5eb4685 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -658,7 +658,9 @@ EXPORT_SYMBOL(generic_buffers_fsync); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { - struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + struct buffer_head *bh; + + bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) write_dirty_buffer(bh, 0); @@ -1440,7 +1442,12 @@ EXPORT_SYMBOL(__find_get_block_nonatomic); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + struct buffer_head *bh; + + if (gfpflags_allow_blocking(gfp)) + bh = __find_get_block_nonatomic(bdev, block, size); + else + bh = __find_get_block(bdev, block, size); might_alloc(gfp); if (bh) -- cgit v1.2.3 From a0b5ff07491010789fcb012bc8f9dad9d26f9a8b Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:18 -0700 Subject: fs/ocfs2: use sleeping version of __find_get_block() This is a path that allows for blocking as it does IO. Convert to the new nonatomic flavor to benefit from potential performance benefits and adapt in the future vs migration such that semantics are kept.
Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-5-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/ocfs2/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f1b4b3e611cb..c7a9729dc9d0 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1249,7 +1249,7 @@ static int ocfs2_force_read_journal(struct inode *inode) } for (i = 0; i < p_blocks; i++, p_blkno++) { - bh = __find_get_block(osb->sb->s_bdev, p_blkno, + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, osb->sb->s_blocksize); /* block not cached. */ if (!bh) -- cgit v1.2.3 From f76d4c28a46a9260d85e00dafc8f46d369365d33 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:19 -0700 Subject: fs/jbd2: use sleeping version of __find_get_block() Convert to the new nonatomic flavor to benefit from potential performance benefits and adapt in the future vs migration such that semantics are kept. - jbd2_journal_revoke(): can sleep (has might_sleep() in the beginning) - jbd2_journal_cancel_revoke(): only used from do_get_write_access() and do_get_create_access() which do sleep. So can sleep. - jbd2_clear_buffer_revoked_flags() - only called from journal commit code which sleeps. So can sleep. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-6-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/jbd2/revoke.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 0cf0fddbee81..1467f6790747 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, bh = bh_in; if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } @@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, /* If there is a different buffer_head lying around in * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh2 = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) @@ -464,7 +466,8 @@ void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * state machine will get very upset later on. 
*/ if (need_cancel) { struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, + bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); @@ -492,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); + bh = __find_get_block_nonatomic(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); -- cgit v1.2.3 From 6e8f57fd09c9fb569d10b2ccc3878155b702591a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:20 -0700 Subject: fs/ext4: use sleeping version of sb_find_get_block() Enable ext4_free_blocks() to use it, which has a cond_resched to begin with. Convert to the new nonatomic flavor to benefit from potential performance benefits and adapt in the future vs migration such that semantics are kept. Suggested-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-7-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/ext4/mballoc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f88424c28194..1e98c5be4e0a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -6642,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } -- cgit v1.2.3 From 2d900efff915fe24c3948d28eef9078953d87fec Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Thu, 17 Apr 2025 18:59:21 -0700 Subject: mm/migrate: fix sleep in atomic for large folios and buffer heads The large folio + buffer head noref migration scenarios are being naughty and blocking while holding a spinlock. As a consequence of the pagecache lookup path taking the folio lock this serializes against migration paths, so they can wait for each other. For the private_lock atomic case, a new BH_Migrate flag is introduced which enables the lookup to bail. This allows the critical region of the private_lock on the migration path to be reduced to the way it was before ebdf4de5642fb6 ("mm: migrate: fix reference check race between __find_get_block() and migration"), that is covering the count checks. The scope is always noref migration. 
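To make the handshake above concrete, here is a minimal self-contained userspace model of the idea (an illustrative sketch only, not the kernel code: buffer_state and BH_MIGRATE_BIT below are stand-ins for bh->b_state and BH_Migrate). The migration side sets a flag bit around its reference checks, and an atomic-context lookup bails out when it observes that bit instead of blocking on the folio lock.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define BH_MIGRATE_BIT 0			/* stand-in for BH_Migrate */

static atomic_ulong buffer_state;		/* stand-in for bh->b_state */

/* atomic-context lookup: bail instead of blocking while migration runs */
static bool lookup_buffer_atomic(void)
{
	if (atomic_load_explicit(&buffer_state, memory_order_acquire) &
	    (1UL << BH_MIGRATE_BIT))
		return false;			/* treated as "not cached" */
	return true;
}

int main(void)
{
	printf("no migration in flight, lookup proceeds: %d\n",
	       lookup_buffer_atomic());

	/* roughly what the noref migration path does around its count checks */
	atomic_fetch_or_explicit(&buffer_state, 1UL << BH_MIGRATE_BIT,
				 memory_order_acq_rel);
	printf("migration in flight, atomic lookup bails: %d\n",
	       lookup_buffer_atomic());
	atomic_fetch_and_explicit(&buffer_state, ~(1UL << BH_MIGRATE_BIT),
				  memory_order_release);

	printf("migration finished, lookup proceeds again: %d\n",
	       lookup_buffer_atomic());
	return 0;
}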
Reported-by: kernel test robot Reported-by: syzbot+f3c6fda1297c748a7076@syzkaller.appspotmail.com Closes: https://lore.kernel.org/oe-lkp/202503101536.27099c77-lkp@intel.com Fixes: 3c20917120ce61 ("block/bdev: enable large folio support for large logical block sizes") Reviewed-by: Jan Kara Co-developed-by: Luis Chamberlain Signed-off-by: Davidlohr Bueso Link: https://kdevops.org/ext4/v6.15-rc2.html # [0] Link: https://lore.kernel.org/all/aAAEvcrmREWa1SKF@bombadil.infradead.org/ # [1] Link: https://lore.kernel.org/20250418015921.132400-8-dave@stgolabs.net Tested-by: kdevops@lists.linux.dev # [0] [1] Reviewed-by: Luis Chamberlain Signed-off-by: Christian Brauner --- fs/buffer.c | 12 +++++++++++- fs/ext4/ialloc.c | 3 ++- include/linux/buffer_head.h | 1 + mm/migrate.c | 8 +++++--- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index f8c9e5eb4685..7be23ff20b27 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -207,6 +207,15 @@ __find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) head = folio_buffers(folio); if (!head) goto out_unlock; + /* + * Upon a noref migration, the folio lock serializes here; + * otherwise bail. + */ + if (test_bit_acquire(BH_Migrate, &head->b_state)) { + WARN_ON(!atomic); + goto out_unlock; + } + bh = head; do { if (!buffer_mapped(bh)) @@ -1390,7 +1399,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) /* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return - * NULL + * NULL. Atomic context callers may also return NULL if the buffer is being + * migrated; similarly the page is not marked accessed either. */ static struct buffer_head * find_get_block_common(struct block_device *bdev, sector_t block, diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 38bc8d74f4cc..e7ecc7c8a729 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it - * must have been written out. + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. 
*/ goto out; diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index c791aa9a08da..0029ff880e27 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -34,6 +34,7 @@ enum bh_state_bits { BH_Meta, /* Buffer contains metadata */ BH_Prio, /* Buffer should be submitted with REQ_PRIO */ BH_Defer_Completion, /* Defer AIO completion to workqueue */ + BH_Migrate, /* Buffer is being migrated (norefs) */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities diff --git a/mm/migrate.c b/mm/migrate.c index f3ee6d8d5e2e..676d9cfc7059 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -845,9 +845,11 @@ static int __buffer_migrate_folio(struct address_space *mapping, return -EAGAIN; if (check_refs) { - bool busy; + bool busy, migrating; bool invalidated = false; + migrating = test_and_set_bit_lock(BH_Migrate, &head->b_state); + VM_WARN_ON_ONCE(migrating); recheck_buffers: busy = false; spin_lock(&mapping->i_private_lock); @@ -859,12 +861,12 @@ recheck_buffers: } bh = bh->b_this_page; } while (bh != head); + spin_unlock(&mapping->i_private_lock); if (busy) { if (invalidated) { rc = -EAGAIN; goto unlock_buffers; } - spin_unlock(&mapping->i_private_lock); invalidate_bh_lrus(); invalidated = true; goto recheck_buffers; @@ -883,7 +885,7 @@ recheck_buffers: unlock_buffers: if (check_refs) - spin_unlock(&mapping->i_private_lock); + clear_bit_unlock(BH_Migrate, &head->b_state); bh = head; do { unlock_buffer(bh); -- cgit v1.2.3 From d1f7256a5a525a44ac6a81d0a8ff931317b2acbf Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Fri, 18 Apr 2025 14:57:56 +0200 Subject: fs: fall back to file_ref_put() for non-last reference This reduces the slowdown in face of multiple callers issuing close on what turns out to not be the last reference. 
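The shape of that change can be illustrated with a small standalone model (a sketch using C11 atomics; the real file_ref_t uses a biased counter with a dead zone, so the constants and helpers here are illustrative only): try a single cmpxchg for the common case where this close drops the last reference, and fall back to an ordinary put when another closer raced in.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define REF_ONEREF	1L	/* illustrative: one reference left */
#define REF_DEAD	-1L	/* illustrative dead marker */

static atomic_long refcnt;

/* stand-in for the generic put: plain decrement, report whether it was last */
static bool ref_put_slow(void)
{
	return atomic_fetch_sub(&refcnt, 1) == REF_ONEREF;
}

/* fast path: one cmpxchg for the common "this close drops the last ref" case */
static bool ref_put_close(void)
{
	long old = atomic_load(&refcnt);

	if (old == REF_ONEREF &&
	    atomic_compare_exchange_strong(&refcnt, &old, REF_DEAD))
		return true;		/* we held the last reference */
	return ref_put_slow();		/* contended: take the generic path */
}

int main(void)
{
	atomic_store(&refcnt, 2);	/* two references outstanding */
	printf("first put drops last ref:  %d\n", ref_put_close());
	printf("second put drops last ref: %d\n", ref_put_close());
	return 0;
}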
Signed-off-by: Mateusz Guzik Link: https://lore.kernel.org/20250418125756.59677-1-mjguzik@gmail.com Reviewed-by: Jan Kara Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202504171513.6d6f8a16-lkp@intel.com Signed-off-by: Christian Brauner --- fs/file.c | 2 +- include/linux/file_ref.h | 19 ++++++------------- 2 files changed, 7 insertions(+), 14 deletions(-) diff --git a/fs/file.c b/fs/file.c index dc3f7e120e3e..3a3146664cf3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,7 +26,7 @@ #include "internal.h" -bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) { /* * If the reference count was already in the dead zone, then this diff --git a/include/linux/file_ref.h b/include/linux/file_ref.h index 7db62fbc0500..31551e4cb8f3 100644 --- a/include/linux/file_ref.h +++ b/include/linux/file_ref.h @@ -61,7 +61,6 @@ static inline void file_ref_init(file_ref_t *ref, unsigned long cnt) atomic_long_set(&ref->refcnt, cnt - 1); } -bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt); bool __file_ref_put(file_ref_t *ref, unsigned long cnt); /** @@ -178,20 +177,14 @@ static __always_inline __must_check bool file_ref_put(file_ref_t *ref) */ static __always_inline __must_check bool file_ref_put_close(file_ref_t *ref) { - long old, new; + long old; old = atomic_long_read(&ref->refcnt); - do { - if (unlikely(old < 0)) - return __file_ref_put_badval(ref, old); - - if (old == FILE_REF_ONEREF) - new = FILE_REF_DEAD; - else - new = old - 1; - } while (!atomic_long_try_cmpxchg(&ref->refcnt, &old, new)); - - return new == FILE_REF_DEAD; + if (likely(old == FILE_REF_ONEREF)) { + if (likely(atomic_long_try_cmpxchg(&ref->refcnt, &old, FILE_REF_DEAD))) + return true; + } + return file_ref_put(ref); } /** -- cgit v1.2.3 From 5cf3c602df88b471178a5717b17e529d09acad84 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Thu, 17 Apr 2025 10:23:15 -0400 Subject: drm/amdgpu: Use allowed_domains for pinning dmabufs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When determining the domains for pinning DMABufs, filter allowed_domains and fail with a warning if VRAM is forbidden and GTT is not an allowed domain. 
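A standalone sketch of that filtering logic (userspace model; the domain bit values and helper below are illustrative, not the real AMDGPU_GEM_DOMAIN_* definitions): start from the BO's allowed domains, drop VRAM when any importer cannot do P2P, and treat an empty mask as an error rather than pinning into a forbidden domain.

#include <stdio.h>

#define GEM_DOMAIN_GTT	0x2	/* illustrative bit values */
#define GEM_DOMAIN_VRAM	0x4

/* pick pinning domains from what the BO allows; drop VRAM if any importer
 * cannot do P2P; fail if nothing usable remains */
static int pick_pin_domains(unsigned int allowed, int all_peers_can_p2p,
			    unsigned int *out)
{
	unsigned int domains = allowed;

	if (!all_peers_can_p2p)
		domains &= ~GEM_DOMAIN_VRAM;
	if (!domains)
		return -1;	/* the driver would WARN and return an error */
	*out = domains;
	return 0;
}

int main(void)
{
	unsigned int domains = 0;

	/* VRAM-only BO with a non-P2P importer: pinning must fail */
	if (pick_pin_domains(GEM_DOMAIN_VRAM, 0, &domains))
		printf("VRAM-only BO, no P2P: no valid pin domain\n");

	/* VRAM|GTT BO with a non-P2P importer: falls back to GTT */
	if (!pick_pin_domains(GEM_DOMAIN_VRAM | GEM_DOMAIN_GTT, 0, &domains))
		printf("VRAM|GTT BO, no P2P: pin domains = 0x%x\n", domains);
	return 0;
}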
Fixes: f5e7fabd1f5c ("drm/amdgpu: allow pinning DMA-bufs into VRAM if all importers can do P2P") Suggested-by: Christian König Signed-off-by: Felix Kuehling Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 3940796a6eefa555fec688a4adee5659ef9fa431) --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 667080cc9ae1..0446586bd5a7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -77,7 +77,7 @@ static int amdgpu_dma_buf_pin(struct dma_buf_attachment *attach) { struct dma_buf *dmabuf = attach->dmabuf; struct amdgpu_bo *bo = gem_to_amdgpu_bo(dmabuf->priv); - u32 domains = bo->preferred_domains; + u32 domains = bo->allowed_domains; dma_resv_assert_held(dmabuf->resv); @@ -93,6 +93,9 @@ static int amdgpu_dma_buf_pin(struct dma_buf_attachment *attach) if (domains & AMDGPU_GEM_DOMAIN_VRAM) bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; + if (WARN_ON(!domains)) + return -EINVAL; + return amdgpu_bo_pin(bo, domains); } -- cgit v1.2.3 From 5e56935b519b2fbbca1cafa0cef3c7c3d062f62d Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Tue, 15 Apr 2025 23:58:28 -0400 Subject: drm/amdgpu: Don't pin VRAM without DMABUF_MOVE_NOTIFY MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pinning of VRAM is for peer devices that don't support dynamic attachment and move notifiers. But it requires that all such peer devices are able to access VRAM via PCIe P2P. Any device without P2P access requires migration to GTT, which fails if the memory is already pinned for another peer device. Sharing between GPUs should not require pinning in VRAM. However, if DMABUF_MOVE_NOTIFY is disabled in the kernel build, even DMABufs shared between GPUs must be pinned, which can lead to failures and functional regressions on systems where some peer GPUs are not P2P accessible. Disable VRAM pinning if move notifiers are disabled in the kernel build to fix regressions when sharing BOs between GPUs. Signed-off-by: Felix Kuehling Tested-by: Hao (Claire) Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 05185812ae3695fe049c14847ce3cbeccff1bf2e) --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 0446586bd5a7..5740e8d1a522 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -81,14 +81,21 @@ static int amdgpu_dma_buf_pin(struct dma_buf_attachment *attach) dma_resv_assert_held(dmabuf->resv); - /* - * Try pinning into VRAM to allow P2P with RDMA NICs without ODP + /* Try pinning into VRAM to allow P2P with RDMA NICs without ODP * support if all attachments can do P2P. If any attachment can't do * P2P just pin into GTT instead. + * + * To avoid with conflicting pinnings between GPUs and RDMA when move + * notifiers are disabled, only allow pinning in VRAM when move + * notiers are enabled. 
*/ - list_for_each_entry(attach, &dmabuf->attachments, node) - if (!attach->peer2peer) - domains &= ~AMDGPU_GEM_DOMAIN_VRAM; + if (!IS_ENABLED(CONFIG_DMABUF_MOVE_NOTIFY)) { + domains &= ~AMDGPU_GEM_DOMAIN_VRAM; + } else { + list_for_each_entry(attach, &dmabuf->attachments, node) + if (!attach->peer2peer) + domains &= ~AMDGPU_GEM_DOMAIN_VRAM; + } if (domains & AMDGPU_GEM_DOMAIN_VRAM) bo->flags |= AMDGPU_GEM_CREATE_CPU_ACCESS_REQUIRED; -- cgit v1.2.3 From 7eb287beeb60be1e4437be2b4e4e9f0da89aab97 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Tue, 1 Apr 2025 17:05:10 -0400 Subject: drm/amd/display: Fix gpu reset in multidisplay config [Why] The indexing of stream_status in dm_gpureset_commit_state() is incorrect. That leads to asserts in multi-display configuration after gpu reset. [How] Adjust the indexing logic to align stream_status with surface_updates. Fixes: cdaae8371aa9 ("drm/amd/display: Handle GPU reset for DC block") Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3808 Reviewed-by: Aurabindo Pillai Reviewed-by: Mario Limonciello Signed-off-by: Roman Li Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit d91bc901398741d317d9b55c59ca949d4bc7394b) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 9fed4471405f..8f3a778df646 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -3355,16 +3355,16 @@ static void dm_gpureset_commit_state(struct dc_state *dc_state, for (k = 0; k < dc_state->stream_count; k++) { bundle->stream_update.stream = dc_state->streams[k]; - for (m = 0; m < dc_state->stream_status->plane_count; m++) { + for (m = 0; m < dc_state->stream_status[k].plane_count; m++) { bundle->surface_updates[m].surface = - dc_state->stream_status->plane_states[m]; + dc_state->stream_status[k].plane_states[m]; bundle->surface_updates[m].surface->force_full_update = true; } update_planes_and_stream_adapter(dm->dc, UPDATE_TYPE_FULL, - dc_state->stream_status->plane_count, + dc_state->stream_status[k].plane_count, dc_state->streams[k], &bundle->stream_update, bundle->surface_updates); -- cgit v1.2.3 From 67fe574651c73fe5cc176e35f28f2ec1ba498d14 Mon Sep 17 00:00:00 2001 From: Roman Li Date: Wed, 26 Mar 2025 10:33:51 -0400 Subject: drm/amd/display: Force full update in gpu reset [Why] While system undergoing gpu reset always do full update to sync the dc state before and after reset. 
[How] Return true in should_reset_plane() if a GPU reset is detected. Reviewed-by: Aurabindo Pillai Reviewed-by: Mario Limonciello Signed-off-by: Roman Li Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit 2ba8619b9a378ad218ad6c2e2ccaee8f531e08de) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 8f3a778df646..61ee530d78ea 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -11043,6 +11043,9 @@ static bool should_reset_plane(struct drm_atomic_state *state, state->allow_modeset) return true; + if (amdgpu_in_reset(adev) && state->allow_modeset) + return true; + /* Exit early if we know that we're adding or removing the plane. */ if (old_plane_state->crtc != new_plane_state->crtc) return true; -- cgit v1.2.3 From 756c85e4d0ddc497b4ad5b1f41ad54e838e06188 Mon Sep 17 00:00:00 2001 From: Nicholas Susanto Date: Wed, 2 Apr 2025 15:04:08 -0400 Subject: drm/amd/display: Enable urgent latency adjustment on DCN35 [Why] Urgent latency adjustment was disabled on DCN35 due to issues with P0 enablement on some platforms. Without urgent latency, underflows occur when doing certain high timing configurations. After testing, we found that reenabling urgent latency didn't reintroduce p0 support on multiple platforms. [How] Re-enable urgent latency on DCN35 and set it to 3000 MHz. This reverts commit 3412860cc4c0c484f53f91b371483e6e4440c3e5. Reviewed-by: Charlene Liu Signed-off-by: Nicholas Susanto Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit cd74ce1f0cddffb3f36d0995d0f61e89f0010738) --- drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c index 92f0a099d089..d9159ca55412 100644 --- a/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c +++ b/drivers/gpu/drm/amd/display/dc/dml/dcn35/dcn35_fpu.c @@ -195,9 +195,9 @@ struct _vcs_dpi_soc_bounding_box_st dcn3_5_soc = { .dcn_downspread_percent = 0.5, .gpuvm_min_page_size_bytes = 4096, .hostvm_min_page_size_bytes = 4096, - .do_urgent_latency_adjustment = 0, + .do_urgent_latency_adjustment = 1, .urgent_latency_adjustment_fabric_clock_component_us = 0, - .urgent_latency_adjustment_fabric_clock_reference_mhz = 0, + .urgent_latency_adjustment_fabric_clock_reference_mhz = 3000, }; void dcn35_build_wm_range_table_fpu(struct clk_mgr *clk_mgr) -- cgit v1.2.3 From a92741e72f91b904c1d8c3d409ed8dbe9c1f2b26 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 16 Apr 2025 00:19:13 -0400 Subject: drm/amdgpu: Allow P2P access through XGMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If peer memory is accessible through XGMI, allow leaving it in VRAM rather than forcing its migration to GTT on DMABuf attachment.
Signed-off-by: Felix Kuehling Tested-by: Hao (Claire) Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 372c8d72c3680fdea3fbb2d6b089f76b4a6d596a) --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 30 ++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index 5740e8d1a522..e6913fcf2c7b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -43,6 +43,29 @@ #include #include +static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops; + +/** + * dma_buf_attach_adev - Helper to get adev of an attachment + * + * @attach: attachment + * + * Returns: + * A struct amdgpu_device * if the attaching device is an amdgpu device or + * partition, NULL otherwise. + */ +static struct amdgpu_device *dma_buf_attach_adev(struct dma_buf_attachment *attach) +{ + if (attach->importer_ops == &amdgpu_dma_buf_attach_ops) { + struct drm_gem_object *obj = attach->importer_priv; + struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); + + return amdgpu_ttm_adev(bo->tbo.bdev); + } + + return NULL; +} + /** * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation * @@ -54,11 +77,13 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) { + struct amdgpu_device *attach_adev = dma_buf_attach_adev(attach); struct drm_gem_object *obj = dmabuf->priv; struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); - if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) + if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && + pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; amdgpu_vm_bo_update_shared(bo); @@ -480,6 +505,9 @@ bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev, struct drm_gem_object *obj = &bo->tbo.base; struct drm_gem_object *gobj; + if (!adev) + return false; + if (obj->import_attach) { struct dma_buf *dma_buf = obj->import_attach->dmabuf; -- cgit v1.2.3 From 870bea21fdf88f45c94c0a3dbb0e3cc1b219680f Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Fri, 4 Apr 2025 09:34:52 -0500 Subject: drm/amd/display: Fix ACPI edid parsing on some Lenovo systems [Why] The ACPI EDID in the BIOS of a Lenovo laptop includes 3 blocks, but dm_helpers_probe_acpi_edid() has a start that is 'char'. The 3rd block index starts after 255, so it can't be indexed properly. This leads to problems with the display when the EDID is parsed. [How] Change the variable type to 'short' so that larger values can be indexed. 
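The truncation described above is easy to demonstrate in isolation (a userspace sketch assuming the standard 128-byte EDID block size, not the driver code itself): with an 8-bit start offset, block indices 2 and 3 wrap around and point back into the first two blocks.

#include <stdio.h>

#define EDID_LENGTH 128		/* one EDID block is 128 bytes */

int main(void)
{
	for (unsigned int block = 0; block < 4; block++) {
		unsigned char  narrow = block * EDID_LENGTH;	/* old type */
		unsigned short wide   = block * EDID_LENGTH;	/* fixed type */

		printf("block %u: unsigned char start = %3u, unsigned short start = %3u\n",
		       block, (unsigned int)narrow, (unsigned int)wide);
	}
	return 0;
}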
Cc: Renjith Pananchikkal Reported-by: Mark Pearson Suggested-by: David Ober Fixes: c6a837088bed ("drm/amd/display: Fetch the EDID from _DDC if available for eDP") Reviewed-by: Alex Hung Signed-off-by: Mario Limonciello Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit a918bb4a90d423ced2976a794f2724c362c1f063) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c index 2cd35392e2da..1395a748d726 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c @@ -918,7 +918,7 @@ dm_helpers_probe_acpi_edid(void *data, u8 *buf, unsigned int block, size_t len) { struct drm_connector *connector = data; struct acpi_device *acpidev = ACPI_COMPANION(connector->dev->dev); - unsigned char start = block * EDID_LENGTH; + unsigned short start = block * EDID_LENGTH; struct edid *edid; int r; -- cgit v1.2.3 From d59bddce49bfd323f1218bb6c3ad314e5c4e8f9d Mon Sep 17 00:00:00 2001 From: George Shen Date: Mon, 7 Apr 2025 12:35:57 -0400 Subject: drm/amd/display: Use 16ms AUX read interval for LTTPR with old sinks [Why/How] LTTPR are required to program DPCD 0000Eh to 0x4 (16ms) upon AUX read reply to this register. Since old Sinks with DPCD rev 1.1 and earlier may not support this register, assume the mandatory value is programmed by the LTTPR to avoid AUX timeout issues. Reviewed-by: Wenjing Liu Signed-off-by: George Shen Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit 1594b60d74959c0680ddf777a74963c98afcdd7e) --- .../dc/link/protocols/link_dp_training_8b_10b.c | 54 ++++++++++++++-------- 1 file changed, 36 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c index 34d2e097ca2e..5a5d48fadbf2 100644 --- a/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c +++ b/drivers/gpu/drm/amd/display/dc/link/protocols/link_dp_training_8b_10b.c @@ -35,6 +35,17 @@ #define DC_LOGGER \ link->ctx->logger +static void get_default_8b_10b_lttpr_aux_rd_interval( + union training_aux_rd_interval *training_rd_interval) +{ + /* LTTPR are required to program DPCD 0000Eh to 0x4 (16ms) upon AUX + * read reply to this register. Since old sinks with DPCD rev 1.1 + * and earlier may not support this register, assume the mandatory + * value is programmed by the LTTPR to avoid AUX timeout issues.
+ */ + training_rd_interval->raw = 0x4; +} + static int32_t get_cr_training_aux_rd_interval(struct dc_link *link, const struct dc_link_settings *link_settings, enum lttpr_mode lttpr_mode) @@ -43,17 +54,22 @@ static int32_t get_cr_training_aux_rd_interval(struct dc_link *link, uint32_t wait_in_micro_secs = 100; memset(&training_rd_interval, 0, sizeof(training_rd_interval)); - if (link_dp_get_encoding_format(link_settings) == DP_8b_10b_ENCODING && - link->dpcd_caps.dpcd_rev.raw >= DPCD_REV_12) { - core_link_read_dpcd( - link, - DP_TRAINING_AUX_RD_INTERVAL, - (uint8_t *)&training_rd_interval, - sizeof(training_rd_interval)); - if (lttpr_mode != LTTPR_MODE_NON_TRANSPARENT) - wait_in_micro_secs = 400; - if (training_rd_interval.bits.TRAINIG_AUX_RD_INTERVAL) - wait_in_micro_secs = training_rd_interval.bits.TRAINIG_AUX_RD_INTERVAL * 4000; + if (link_dp_get_encoding_format(link_settings) == DP_8b_10b_ENCODING) { + if (link->dpcd_caps.dpcd_rev.raw >= DPCD_REV_12) + core_link_read_dpcd( + link, + DP_TRAINING_AUX_RD_INTERVAL, + (uint8_t *)&training_rd_interval, + sizeof(training_rd_interval)); + else if (dp_is_lttpr_present(link)) + get_default_8b_10b_lttpr_aux_rd_interval(&training_rd_interval); + + if (training_rd_interval.raw != 0) { + if (lttpr_mode != LTTPR_MODE_NON_TRANSPARENT) + wait_in_micro_secs = 400; + if (training_rd_interval.bits.TRAINIG_AUX_RD_INTERVAL) + wait_in_micro_secs = training_rd_interval.bits.TRAINIG_AUX_RD_INTERVAL * 4000; + } } return wait_in_micro_secs; } @@ -71,13 +87,15 @@ static uint32_t get_eq_training_aux_rd_interval( DP_128B132B_TRAINING_AUX_RD_INTERVAL, (uint8_t *)&training_rd_interval, sizeof(training_rd_interval)); - } else if (link_dp_get_encoding_format(link_settings) == DP_8b_10b_ENCODING && - link->dpcd_caps.dpcd_rev.raw >= DPCD_REV_12) { - core_link_read_dpcd( - link, - DP_TRAINING_AUX_RD_INTERVAL, - (uint8_t *)&training_rd_interval, - sizeof(training_rd_interval)); + } else if (link_dp_get_encoding_format(link_settings) == DP_8b_10b_ENCODING) { + if (link->dpcd_caps.dpcd_rev.raw >= DPCD_REV_12) + core_link_read_dpcd( + link, + DP_TRAINING_AUX_RD_INTERVAL, + (uint8_t *)&training_rd_interval, + sizeof(training_rd_interval)); + else if (dp_is_lttpr_present(link)) + get_default_8b_10b_lttpr_aux_rd_interval(&training_rd_interval); } switch (training_rd_interval.bits.TRAINIG_AUX_RD_INTERVAL) { -- cgit v1.2.3 From 6ed0dc3fd39558f48119daf8f99f835deb7d68da Mon Sep 17 00:00:00 2001 From: Leo Li Date: Tue, 18 Mar 2025 18:05:05 -0400 Subject: drm/amd/display: Default IPS to RCG_IN_ACTIVE_IPS2_IN_OFF [Why] Recent findings show negligible power savings between IPS2 and RCG during static desktop. In fact, DCN related clocks are higher when IPS2 is enabled vs RCG. RCG_IN_ACTIVE is also the default policy for another OS supported by DC, and it has faster entry/exit. [How] Remove previous logic that checked for IPS2 support, and just default to `DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF`. 
Fixes: 199888aa25b3 ("drm/amd/display: Update IPS default mode for DCN35/DCN351") Reviewed-by: Aurabindo Pillai Signed-off-by: Leo Li Signed-off-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit 8f772d79ef39b463ead00ef6f009bebada3a9d49) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 61ee530d78ea..5fe0b4921568 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -1920,26 +1920,6 @@ static enum dmub_ips_disable_type dm_get_default_ips_mode( switch (amdgpu_ip_version(adev, DCE_HWIP, 0)) { case IP_VERSION(3, 5, 0): case IP_VERSION(3, 6, 0): - /* - * On DCN35 systems with Z8 enabled, it's possible for IPS2 + Z8 to - * cause a hard hang. A fix exists for newer PMFW. - * - * As a workaround, for non-fixed PMFW, force IPS1+RCG as the deepest - * IPS state in all cases, except for s0ix and all displays off (DPMS), - * where IPS2 is allowed. - * - * When checking pmfw version, use the major and minor only. - */ - if ((adev->pm.fw_version & 0x00FFFF00) < 0x005D6300) - ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; - else if (amdgpu_ip_version(adev, GC_HWIP, 0) > IP_VERSION(11, 5, 0)) - /* - * Other ASICs with DCN35 that have residency issues with - * IPS2 in idle. - * We want them to use IPS2 only in display off cases. - */ - ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; - break; case IP_VERSION(3, 5, 1): ret = DMUB_IPS_RCG_IN_ACTIVE_IPS2_IN_OFF; break; -- cgit v1.2.3 From b316727a27d0dac1e6b7ae51204df4d0f241fcc2 Mon Sep 17 00:00:00 2001 From: Gergo Koteles Date: Wed, 2 Apr 2025 19:03:31 +0200 Subject: drm/amd/display: do not copy invalid CRTC timing info Since b255ce4388e0, it is possible that the CRTC timing information for the preferred mode has not yet been calculated while amdgpu_dm_connector_mode_valid() is running. In this case use the CRTC timing information of the actual mode. 
Fixes: b255ce4388e0 ("drm/amdgpu: don't change mode in amdgpu_dm_connector_mode_valid()") Closes: https://lore.kernel.org/all/ed09edb167e74167a694f4854102a3de6d2f1433.camel@irl.hu/ Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4085 Signed-off-by: Gergo Koteles Reviewed-by: Zaeem Mohamed Tested-by: Mark Broadworth Signed-off-by: Alex Deucher (cherry picked from commit 20232192a5044d1f66dcbef0a24de1bb8157738d) Cc: stable@vger.kernel.org --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 5fe0b4921568..536f73131c2d 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -6501,12 +6501,12 @@ decide_crtc_timing_for_drm_display_mode(struct drm_display_mode *drm_mode, const struct drm_display_mode *native_mode, bool scale_enabled) { - if (scale_enabled) { - copy_crtc_timing_for_drm_display_mode(native_mode, drm_mode); - } else if (native_mode->clock == drm_mode->clock && - native_mode->htotal == drm_mode->htotal && - native_mode->vtotal == drm_mode->vtotal) { - copy_crtc_timing_for_drm_display_mode(native_mode, drm_mode); + if (scale_enabled || ( + native_mode->clock == drm_mode->clock && + native_mode->htotal == drm_mode->htotal && + native_mode->vtotal == drm_mode->vtotal)) { + if (native_mode->crtc_clock) + copy_crtc_timing_for_drm_display_mode(native_mode, drm_mode); } else { /* no scaling nor amdgpu inserted, no need to patch */ } -- cgit v1.2.3 From 4c8925cb9db158c812e1e11f3e74b945df7c9801 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Apr 2025 17:16:01 +0100 Subject: net: phylink: fix suspend/resume with WoL enabled and link down When WoL is enabled, we update the software state in phylink to indicate that the link is down, and disable the resolver from bringing the link back up. On resume, we attempt to bring the overall state into consistency by calling the .mac_link_down() method, but this is wrong if the link was already down, as phylink strictly orders the .mac_link_up() and .mac_link_down() methods - and this would break that ordering. Fixes: f97493657c63 ("net: phylink: add suspend/resume support") Signed-off-by: Russell King (Oracle) Tested-by: Russell King (Oracle) Link: https://patch.msgid.link/E1u55Qf-0016RN-PA@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- drivers/net/phy/phylink.c | 38 ++++++++++++++++++++++---------------- 1 file changed, 22 insertions(+), 16 deletions(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index b68369e2342b..1bdd5d8bb5b0 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -81,6 +81,7 @@ struct phylink { unsigned int pcs_state; bool link_failed; + bool suspend_link_up; bool major_config_failed; bool mac_supports_eee_ops; bool mac_supports_eee; @@ -2545,14 +2546,16 @@ void phylink_suspend(struct phylink *pl, bool mac_wol) /* Stop the resolver bringing the link up */ __set_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state); - /* Disable the carrier, to prevent transmit timeouts, - * but one would hope all packets have been sent. This - * also means phylink_resolve() will do nothing. 
- */ - if (pl->netdev) - netif_carrier_off(pl->netdev); - else + pl->suspend_link_up = phylink_link_is_up(pl); + if (pl->suspend_link_up) { + /* Disable the carrier, to prevent transmit timeouts, + * but one would hope all packets have been sent. This + * also means phylink_resolve() will do nothing. + */ + if (pl->netdev) + netif_carrier_off(pl->netdev); pl->old_link_state = false; + } /* We do not call mac_link_down() here as we want the * link to remain up to receive the WoL packets. @@ -2603,15 +2606,18 @@ void phylink_resume(struct phylink *pl) if (test_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state)) { /* Wake-on-Lan enabled, MAC handling */ - /* Call mac_link_down() so we keep the overall state balanced. - * Do this under the state_mutex lock for consistency. This - * will cause a "Link Down" message to be printed during - * resume, which is harmless - the true link state will be - * printed when we run a resolve. - */ - mutex_lock(&pl->state_mutex); - phylink_link_down(pl); - mutex_unlock(&pl->state_mutex); + if (pl->suspend_link_up) { + /* Call mac_link_down() so we keep the overall state + * balanced. Do this under the state_mutex lock for + * consistency. This will cause a "Link Down" message + * to be printed during resume, which is harmless - + * the true link state will be printed when we run a + * resolve. + */ + mutex_lock(&pl->state_mutex); + phylink_link_down(pl); + mutex_unlock(&pl->state_mutex); + } /* Re-apply the link parameters so that all the settings get * restored to the MAC. -- cgit v1.2.3 From ce6815585d460c610e9881a5d347c0a34da287e4 Mon Sep 17 00:00:00 2001 From: "Russell King (Oracle)" Date: Wed, 16 Apr 2025 22:53:19 +0100 Subject: net: phylink: mac_link_(up|down)() clarifications As a result of an email from the fbnic author, I reviewed the phylink documentation, and I have decided to clarify the wording in the mac_link_(up|down)() kernel documentation as this was written from the point of view of mvneta/mvpp2 and is misleading. The documentation talks about forcing the link - indeed, this is what is done in the mvneta and mvpp2 drivers but not at the physical layer but the MACs idea, which has the effect of only allowing or stopping packet flow at the MAC. This "link" needs to be controlled when using a PHY or fixed link to start or stop packet flow at the MAC. However, as the MAC and PCS are tightly integrated, if the MACs idea of the link is forced down, it has the side effect that there is no way to determine that the media link has come up - in this mode, the MAC must be allowed to follow its built-in PCS so we can read the link state. Frame the documentation in more generic terms, to avoid the thought that the physical media link to the partner needs in some way to be forced up or down with these calls; it does not. If that were to be done, it would be a self-fulfilling prophecy - e.g. if the media link goes down, then mac_link_down() will be called, and if the media link is then placed into a forced down state, there is no possibility that the media link will ever come up again - clearly this is a wrong interpretation. These methods are notifications to the MAC about what has happened to the media link state - either from the PHY, or a PCS, or whatever mechanism fixed-link is using. Thus, reword them to get away from talking about changing link state to avoid confusion with media link state. This is not a change of any requirements of these methods. 
Also, remove the obsolete references to EEE for these methods, we now have the LPI functions for configuring the EEE parameters which renders this redundant, and also makes the passing of "phy" to the mac_link_up() function obsolete. Signed-off-by: Russell King (Oracle) Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/E1u5Ah5-001GO1-7E@rmk-PC.armlinux.org.uk Signed-off-by: Jakub Kicinski --- include/linux/phylink.h | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/include/linux/phylink.h b/include/linux/phylink.h index 1f5773ab5660..30659b615fca 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -361,23 +361,29 @@ int mac_finish(struct phylink_config *config, unsigned int mode, phy_interface_t iface); /** - * mac_link_down() - take the link down + * mac_link_down() - notification that the link has gone down * @config: a pointer to a &struct phylink_config. * @mode: link autonegotiation mode * @interface: link &typedef phy_interface_t mode * - * If @mode is not an in-band negotiation mode (as defined by - * phylink_autoneg_inband()), force the link down and disable any - * Energy Efficient Ethernet MAC configuration. Interface type - * selection must be done in mac_config(). + * Notifies the MAC that the link has gone down. This will not be called + * unless mac_link_up() has been previously called. + * + * The MAC should stop processing packets for transmission and reception. + * phylink will have called netif_carrier_off() to notify the networking + * stack that the link has gone down, so MAC drivers should not make this + * call. + * + * If @mode is %MLO_AN_INBAND, then this function must not prevent the + * link coming up. */ void mac_link_down(struct phylink_config *config, unsigned int mode, phy_interface_t interface); /** - * mac_link_up() - allow the link to come up + * mac_link_up() - notification that the link has come up * @config: a pointer to a &struct phylink_config. - * @phy: any attached phy + * @phy: any attached phy (deprecated - please use LPI interfaces) * @mode: link autonegotiation mode * @interface: link &typedef phy_interface_t mode * @speed: link speed @@ -385,7 +391,10 @@ void mac_link_down(struct phylink_config *config, unsigned int mode, * @tx_pause: link transmit pause enablement status * @rx_pause: link receive pause enablement status * - * Configure the MAC for an established link. + * Notifies the MAC that the link has come up, and the parameters of the + * link as seen from the MACs point of view. If mac_link_up() has been + * called previously, there will be an intervening call to mac_link_down() + * before this method will be subsequently called. * * @speed, @duplex, @tx_pause and @rx_pause indicate the finalised link * settings, and should be used to configure the MAC block appropriately @@ -397,9 +406,9 @@ void mac_link_down(struct phylink_config *config, unsigned int mode, * that the user wishes to override the pause settings, and this should * be allowed when considering the implementation of this method. * - * If in-band negotiation mode is disabled, allow the link to come up. If - * @phy is non-%NULL, configure Energy Efficient Ethernet by calling - * phy_init_eee() and perform appropriate MAC configuration for EEE. + * Once configured, the MAC may begin to process packets for transmission + * and reception. + * * Interface type selection must be done in mac_config(). 
*/ void mac_link_up(struct phylink_config *config, struct phy_device *phy, -- cgit v1.2.3 From b7f0ee992adf601aa00c252418266177eb7ac2bc Mon Sep 17 00:00:00 2001 From: Qingfang Deng Date: Thu, 17 Apr 2025 11:25:56 +0800 Subject: net: phy: leds: fix memory leak A network restart test on a router led to an out-of-memory condition, which was traced to a memory leak in the PHY LED trigger code. The root cause is misuse of the devm API. The registration function (phy_led_triggers_register) is called from phy_attach_direct, not phy_probe, and the unregister function (phy_led_triggers_unregister) is called from phy_detach, not phy_remove. This means the register and unregister functions can be called multiple times for the same PHY device, but devm-allocated memory is not freed until the driver is unbound. This also prevents kmemleak from detecting the leak, as the devm API internally stores the allocated pointer. Fix this by replacing devm_kzalloc/devm_kcalloc with standard kzalloc/kcalloc, and add the corresponding kfree calls in the unregister path. Fixes: 3928ee6485a3 ("net: phy: leds: Add support for "link" trigger") Fixes: 2e0bc452f472 ("net: phy: leds: add support for led triggers on phy link state change") Signed-off-by: Hao Guan Signed-off-by: Qingfang Deng Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20250417032557.2929427-1-dqfext@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/phy_led_triggers.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/drivers/net/phy/phy_led_triggers.c b/drivers/net/phy/phy_led_triggers.c index bd3c9554f6ac..60893691d4c3 100644 --- a/drivers/net/phy/phy_led_triggers.c +++ b/drivers/net/phy/phy_led_triggers.c @@ -93,9 +93,8 @@ int phy_led_triggers_register(struct phy_device *phy) if (!phy->phy_num_led_triggers) return 0; - phy->led_link_trigger = devm_kzalloc(&phy->mdio.dev, - sizeof(*phy->led_link_trigger), - GFP_KERNEL); + phy->led_link_trigger = kzalloc(sizeof(*phy->led_link_trigger), + GFP_KERNEL); if (!phy->led_link_trigger) { err = -ENOMEM; goto out_clear; @@ -105,10 +104,9 @@ int phy_led_triggers_register(struct phy_device *phy) if (err) goto out_free_link; - phy->phy_led_triggers = devm_kcalloc(&phy->mdio.dev, - phy->phy_num_led_triggers, - sizeof(struct phy_led_trigger), - GFP_KERNEL); + phy->phy_led_triggers = kcalloc(phy->phy_num_led_triggers, + sizeof(struct phy_led_trigger), + GFP_KERNEL); if (!phy->phy_led_triggers) { err = -ENOMEM; goto out_unreg_link; @@ -129,11 +127,11 @@ int phy_led_triggers_register(struct phy_device *phy) out_unreg: while (i--) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); - devm_kfree(&phy->mdio.dev, phy->phy_led_triggers); + kfree(phy->phy_led_triggers); out_unreg_link: phy_led_trigger_unregister(phy->led_link_trigger); out_free_link: - devm_kfree(&phy->mdio.dev, phy->led_link_trigger); + kfree(phy->led_link_trigger); phy->led_link_trigger = NULL; out_clear: phy->phy_num_led_triggers = 0; @@ -147,8 +145,13 @@ void phy_led_triggers_unregister(struct phy_device *phy) for (i = 0; i < phy->phy_num_led_triggers; i++) phy_led_trigger_unregister(&phy->phy_led_triggers[i]); + kfree(phy->phy_led_triggers); + phy->phy_led_triggers = NULL; - if (phy->led_link_trigger) + if (phy->led_link_trigger) { phy_led_trigger_unregister(phy->led_link_trigger); + kfree(phy->led_link_trigger); + phy->led_link_trigger = NULL; + } } EXPORT_SYMBOL_GPL(phy_led_triggers_unregister); -- cgit v1.2.3 From 4bc12818b363bd30f0f7348dd9ab077290a637ae Mon Sep 17 00:00:00 2001 From: Bui 
Quang Minh Date: Thu, 17 Apr 2025 14:28:03 +0700 Subject: virtio-net: disable delayed refill when pausing rx When pausing rx (e.g. set up xdp, xsk pool, rx resize), we call napi_disable() on the receive queue's napi. In delayed refill_work, it also calls napi_disable() on the receive queue's napi. When napi_disable() is called on an already disabled napi, it will sleep in napi_disable_locked while still holding the netdev_lock. As a result, later napi_enable gets stuck too as it cannot acquire the netdev_lock. This leads to refill_work and the pause-then-resume tx being stuck altogether. This scenario can be reproduced by binding an XDP socket to the virtio-net interface without setting up the fill ring. As a result, try_fill_recv will fail until the fill ring is set up and refill_work is scheduled. This commit adds virtnet_rx_(pause/resume)_all helpers and fixes up virtnet_rx_resume to disable future delayed refill_work and cancel any in-flight refill_work before calling napi_disable() to pause the rx. Fixes: 413f0271f396 ("net: protect NAPI enablement with netdev_lock()") Acked-by: Michael S. Tsirkin Signed-off-by: Bui Quang Minh Acked-by: Jason Wang Link: https://patch.msgid.link/20250417072806.18660-2-minhquangbui99@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/virtio_net.c | 69 +++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 57 insertions(+), 12 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 7e4617216a4b..848fab51dfa1 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -3342,7 +3342,8 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) return NETDEV_TX_OK; } -static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) +static void __virtnet_rx_pause(struct virtnet_info *vi, + struct receive_queue *rq) { bool running = netif_running(vi->dev); @@ -3352,17 +3353,63 @@ static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) } } -static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) +static void virtnet_rx_pause_all(struct virtnet_info *vi) +{ + int i; + + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock. + */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + for (i = 0; i < vi->max_queue_pairs; i++) + __virtnet_rx_pause(vi, &vi->rq[i]); +} + +static void virtnet_rx_pause(struct virtnet_info *vi, struct receive_queue *rq) +{ + /* + * Make sure refill_work does not run concurrently to + * avoid napi_disable race which leads to deadlock.
+ */ + disable_delayed_refill(vi); + cancel_delayed_work_sync(&vi->refill); + __virtnet_rx_pause(vi, rq); +} + +static void __virtnet_rx_resume(struct virtnet_info *vi, + struct receive_queue *rq, + bool refill) { bool running = netif_running(vi->dev); - if (!try_fill_recv(vi, rq, GFP_KERNEL)) + if (refill && !try_fill_recv(vi, rq, GFP_KERNEL)) schedule_delayed_work(&vi->refill, 0); if (running) virtnet_napi_enable(rq); } +static void virtnet_rx_resume_all(struct virtnet_info *vi) +{ + int i; + + enable_delayed_refill(vi); + for (i = 0; i < vi->max_queue_pairs; i++) { + if (i < vi->curr_queue_pairs) + __virtnet_rx_resume(vi, &vi->rq[i], true); + else + __virtnet_rx_resume(vi, &vi->rq[i], false); + } +} + +static void virtnet_rx_resume(struct virtnet_info *vi, struct receive_queue *rq) +{ + enable_delayed_refill(vi); + __virtnet_rx_resume(vi, rq, true); +} + static int virtnet_rx_resize(struct virtnet_info *vi, struct receive_queue *rq, u32 ring_num) { @@ -5959,12 +6006,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, if (prog) bpf_prog_add(prog, vi->max_queue_pairs - 1); + virtnet_rx_pause_all(vi); + /* Make sure NAPI is not using any XDP TX queues for RX. */ if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - virtnet_napi_disable(&vi->rq[i]); + for (i = 0; i < vi->max_queue_pairs; i++) virtnet_napi_tx_disable(&vi->sq[i]); - } } if (!prog) { @@ -5996,13 +6043,12 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, vi->xdp_enabled = false; } + virtnet_rx_resume_all(vi); for (i = 0; i < vi->max_queue_pairs; i++) { if (old_prog) bpf_prog_put(old_prog); - if (netif_running(dev)) { - virtnet_napi_enable(&vi->rq[i]); + if (netif_running(dev)) virtnet_napi_tx_enable(&vi->sq[i]); - } } return 0; @@ -6014,11 +6060,10 @@ err: rcu_assign_pointer(vi->rq[i].xdp_prog, old_prog); } + virtnet_rx_resume_all(vi); if (netif_running(dev)) { - for (i = 0; i < vi->max_queue_pairs; i++) { - virtnet_napi_enable(&vi->rq[i]); + for (i = 0; i < vi->max_queue_pairs; i++) virtnet_napi_tx_enable(&vi->sq[i]); - } } if (prog) bpf_prog_sub(prog, vi->max_queue_pairs - 1); -- cgit v1.2.3 From 002ba346e3d76bb2b09448beed06c5ea1b0e06b8 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 21 Apr 2025 11:31:31 +0800 Subject: crypto: scomp - Fix off-by-one bug when calculating last page Fix off-by-one bug in the last page calculation for src and dst. 
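The off-by-one is easiest to see with a standalone arithmetic check (userspace sketch; the kernel code uses unsigned types, where the old formula's "- 1" underflows instead of going negative, but the computed last-page index is wrong either way, so signed long is used here purely to keep the demonstration readable). For page-aligned lengths at offset 0, the old expression lands one page past the data, which is the case the fix addresses.

#include <stdio.h>

#define PAGE_SIZE 4096L
#define offset_in_page(x) ((x) & (PAGE_SIZE - 1))

/* index of the last page touched by a buffer at page offset 'off', 'len' bytes long */
static long last_page_old(long off, long len)
{
	long n = len / PAGE_SIZE;

	n += (offset_in_page(len) + off - 1) / PAGE_SIZE;
	return n;
}

static long last_page_new(long off, long len)
{
	long n = (len - 1) / PAGE_SIZE;

	n += (offset_in_page(len - 1) + off) / PAGE_SIZE;
	return n;
}

int main(void)
{
	long cases[][2] = { { 0, PAGE_SIZE }, { 0, 2 * PAGE_SIZE }, { 512, PAGE_SIZE } };

	for (unsigned int i = 0; i < 3; i++) {
		long off = cases[i][0], len = cases[i][1];

		printf("off=%4ld len=%5ld: old=%ld new=%ld expected=%ld\n",
		       off, len, last_page_old(off, len), last_page_new(off, len),
		       (off + len - 1) / PAGE_SIZE);
	}
	return 0;
}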
Reported-by: Nhat Pham Fixes: 2d3553ecb4e3 ("crypto: scomp - Remove support for some non-trivial SG lists") Signed-off-by: Herbert Xu --- crypto/scompress.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/crypto/scompress.c b/crypto/scompress.c index 5762fcc63b51..36934c78d127 100644 --- a/crypto/scompress.c +++ b/crypto/scompress.c @@ -215,8 +215,8 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) spage = nth_page(spage, soff / PAGE_SIZE); soff = offset_in_page(soff); - n = slen / PAGE_SIZE; - n += (offset_in_page(slen) + soff - 1) / PAGE_SIZE; + n = (slen - 1) / PAGE_SIZE; + n += (offset_in_page(slen - 1) + soff) / PAGE_SIZE; if (PageHighMem(nth_page(spage, n)) && size_add(soff, slen) > PAGE_SIZE) break; @@ -243,9 +243,9 @@ static int scomp_acomp_comp_decomp(struct acomp_req *req, int dir) dpage = nth_page(dpage, doff / PAGE_SIZE); doff = offset_in_page(doff); - n = dlen / PAGE_SIZE; - n += (offset_in_page(dlen) + doff - 1) / PAGE_SIZE; - if (PageHighMem(dpage + n) && + n = (dlen - 1) / PAGE_SIZE; + n += (offset_in_page(dlen - 1) + doff) / PAGE_SIZE; + if (PageHighMem(nth_page(dpage, n)) && size_add(doff, dlen) > PAGE_SIZE) break; dst = kmap_local_page(dpage) + doff; -- cgit v1.2.3 From 8006aff15516a170640239c5a8e6696c0ba18d8e Mon Sep 17 00:00:00 2001 From: Marek Behún Date: Tue, 22 Apr 2025 11:57:18 +0200 Subject: crypto: atmel-sha204a - Set hwrng quality to lowest possible MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit According to the review by Bill Cox [1], the Atmel SHA204A random number generator produces random numbers with very low entropy. Set the lowest possible entropy for this chip just to be safe. [1] https://www.metzdowd.com/pipermail/cryptography/2014-December/023858.html Fixes: da001fb651b00e1d ("crypto: atmel-i2c - add support for SHA204A random number generator") Cc: Signed-off-by: Marek Behún Acked-by: Ard Biesheuvel Reviewed-by: Linus Walleij Signed-off-by: Herbert Xu --- drivers/crypto/atmel-sha204a.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/crypto/atmel-sha204a.c b/drivers/crypto/atmel-sha204a.c index 75bebec2c757..0fcf4a39de27 100644 --- a/drivers/crypto/atmel-sha204a.c +++ b/drivers/crypto/atmel-sha204a.c @@ -163,6 +163,12 @@ static int atmel_sha204a_probe(struct i2c_client *client) i2c_priv->hwrng.name = dev_name(&client->dev); i2c_priv->hwrng.read = atmel_sha204a_rng_read; + /* + * According to review by Bill Cox [1], this HWRNG has very low entropy. 
+ * [1] https://www.metzdowd.com/pipermail/cryptography/2014-December/023858.html + */ + i2c_priv->hwrng.quality = 1; + ret = devm_hwrng_register(&client->dev, &i2c_priv->hwrng); if (ret) dev_warn(&client->dev, "failed to register RNG (%d)\n", ret); -- cgit v1.2.3 From d63527e109e811ef11abb1c2985048fdb528b4cb Mon Sep 17 00:00:00 2001 From: Tung Nguyen Date: Thu, 17 Apr 2025 14:47:15 +0700 Subject: tipc: fix NULL pointer dereference in tipc_mon_reinit_self() syzbot reported: tipc: Node number set to 1055423674 Oops: general protection fault, probably for non-canonical address 0xdffffc0000000000: 0000 [#1] SMP KASAN NOPTI KASAN: null-ptr-deref in range [0x0000000000000000-0x0000000000000007] CPU: 3 UID: 0 PID: 6017 Comm: kworker/3:5 Not tainted 6.15.0-rc1-syzkaller-00246-g900241a5cc15 #0 PREEMPT(full) Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-debian-1.16.3-2~bpo12+1 04/01/2014 Workqueue: events tipc_net_finalize_work RIP: 0010:tipc_mon_reinit_self+0x11c/0x210 net/tipc/monitor.c:719 ... RSP: 0018:ffffc9000356fb68 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000003ee87cba RDX: 0000000000000000 RSI: ffffffff8dbc56a7 RDI: ffff88804c2cc010 RBP: dffffc0000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000007 R13: fffffbfff2111097 R14: ffff88804ead8000 R15: ffff88804ead9010 FS: 0000000000000000(0000) GS:ffff888097ab9000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000f720eb00 CR3: 000000000e182000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: tipc_net_finalize+0x10b/0x180 net/tipc/net.c:140 process_one_work+0x9cc/0x1b70 kernel/workqueue.c:3238 process_scheduled_works kernel/workqueue.c:3319 [inline] worker_thread+0x6c8/0xf10 kernel/workqueue.c:3400 kthread+0x3c2/0x780 kernel/kthread.c:464 ret_from_fork+0x45/0x80 arch/x86/kernel/process.c:153 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:245 ... RIP: 0010:tipc_mon_reinit_self+0x11c/0x210 net/tipc/monitor.c:719 ... RSP: 0018:ffffc9000356fb68 EFLAGS: 00010246 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 000000003ee87cba RDX: 0000000000000000 RSI: ffffffff8dbc56a7 RDI: ffff88804c2cc010 RBP: dffffc0000000000 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000007 R13: fffffbfff2111097 R14: ffff88804ead8000 R15: ffff88804ead9010 FS: 0000000000000000(0000) GS:ffff888097ab9000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000f720eb00 CR3: 000000000e182000 CR4: 0000000000352ef0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 There is a racing condition between workqueue created when enabling bearer and another thread created when disabling bearer right after that as follow: enabling_bearer | disabling_bearer --------------- | ---------------- tipc_disc_timeout() | { | bearer_disable() ... | { schedule_work(&tn->work); | tipc_mon_delete() ... | { } | ... | write_lock_bh(&mon->lock); | mon->self = NULL; | write_unlock_bh(&mon->lock); | ... | } tipc_net_finalize_work() | } { | ... | tipc_net_finalize() | { | ... | tipc_mon_reinit_self() | { | ... | write_lock_bh(&mon->lock); | mon->self->addr = tipc_own_addr(net); | write_unlock_bh(&mon->lock); | ... | } | ... | } | ... 
| } | 'mon->self' is set to NULL in disabling_bearer thread and dereferenced later in enabling_bearer thread. This commit fixes this issue by validating 'mon->self' before assigning node address to it. Reported-by: syzbot+ed60da8d686dc709164c@syzkaller.appspotmail.com Fixes: 46cb01eeeb86 ("tipc: update mon's self addr when node addr generated") Signed-off-by: Tung Nguyen Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250417074826.578115-1-tung.quang.nguyen@est.tech Signed-off-by: Jakub Kicinski --- net/tipc/monitor.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index e2f19627e43d..b45c5b91bc7a 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -716,7 +716,8 @@ void tipc_mon_reinit_self(struct net *net) if (!mon) continue; write_lock_bh(&mon->lock); - mon->self->addr = tipc_own_addr(net); + if (mon->self) + mon->self->addr = tipc_own_addr(net); write_unlock_bh(&mon->lock); } } -- cgit v1.2.3 From 491ef1117c56476f199b481f8c68820fe4c3a7c2 Mon Sep 17 00:00:00 2001 From: Bo-Cun Chen Date: Thu, 17 Apr 2025 17:41:07 +0100 Subject: net: ethernet: mtk_eth_soc: net: revise NETSYSv3 hardware configuration Change hardware configuration for the NETSYSv3. - Enable PSE dummy page mechanism for the GDM1/2/3 - Enable PSE drop mechanism when the WDMA Rx ring full - Enable PSE no-drop mechanism for packets from the WDMA Tx - Correct PSE free drop threshold - Correct PSE CDMA high threshold Fixes: 1953f134a1a8b ("net: ethernet: mtk_eth_soc: add NETSYS_V3 version support") Signed-off-by: Bo-Cun Chen Signed-off-by: Daniel Golle Reviewed-by: Simon Horman Link: https://patch.msgid.link/b71f8fd9d4bb69c646c4d558f9331dd965068606.1744907886.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 24 ++++++++++++++++++++---- drivers/net/ethernet/mediatek/mtk_eth_soc.h | 10 +++++++++- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index bdb98c9d8b1c..47807b202310 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -4043,11 +4043,27 @@ static int mtk_hw_init(struct mtk_eth *eth, bool reset) mtk_w32(eth, 0x21021000, MTK_FE_INT_GRP); if (mtk_is_netsys_v3_or_greater(eth)) { - /* PSE should not drop port1, port8 and port9 packets */ - mtk_w32(eth, 0x00000302, PSE_DROP_CFG); + /* PSE dummy page mechanism */ + mtk_w32(eth, PSE_DUMMY_WORK_GDM(1) | PSE_DUMMY_WORK_GDM(2) | + PSE_DUMMY_WORK_GDM(3) | DUMMY_PAGE_THR, PSE_DUMY_REQ); + + /* PSE free buffer drop threshold */ + mtk_w32(eth, 0x00600009, PSE_IQ_REV(8)); + + /* PSE should not drop port8, port9 and port13 packets from + * WDMA Tx + */ + mtk_w32(eth, 0x00002300, PSE_DROP_CFG); + + /* PSE should drop packets to port8, port9 and port13 on WDMA Rx + * ring full + */ + mtk_w32(eth, 0x00002300, PSE_PPE_DROP(0)); + mtk_w32(eth, 0x00002300, PSE_PPE_DROP(1)); + mtk_w32(eth, 0x00002300, PSE_PPE_DROP(2)); /* GDM and CDM Threshold */ - mtk_w32(eth, 0x00000707, MTK_CDMW0_THRES); + mtk_w32(eth, 0x08000707, MTK_CDMW0_THRES); mtk_w32(eth, 0x00000077, MTK_CDMW1_THRES); /* Disable GDM1 RX CRC stripping */ @@ -4064,7 +4080,7 @@ static int mtk_hw_init(struct mtk_eth *eth, bool reset) mtk_w32(eth, 0x00000300, PSE_DROP_CFG); /* PSE should drop packets to port 8/9 on WDMA Rx ring full */ - mtk_w32(eth, 0x00000300, PSE_PPE0_DROP); + mtk_w32(eth, 0x00000300, PSE_PPE_DROP(0)); /* PSE Free Queue Flow Control 
*/ mtk_w32(eth, 0x01fa01f4, PSE_FQFC_CFG2); diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.h b/drivers/net/ethernet/mediatek/mtk_eth_soc.h index 39709649ea8d..88ef2e9c50fc 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.h +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.h @@ -151,7 +151,15 @@ #define PSE_FQFC_CFG1 0x100 #define PSE_FQFC_CFG2 0x104 #define PSE_DROP_CFG 0x108 -#define PSE_PPE0_DROP 0x110 +#define PSE_PPE_DROP(x) (0x110 + ((x) * 0x4)) + +/* PSE Last FreeQ Page Request Control */ +#define PSE_DUMY_REQ 0x10C +/* PSE_DUMY_REQ is not a typo but actually called like that also in + * MediaTek's datasheet + */ +#define PSE_DUMMY_WORK_GDM(x) BIT(16 + (x)) +#define DUMMY_PAGE_THR 0x1 /* PSE Input Queue Reservation Register*/ #define PSE_IQ_REV(x) (0x140 + (((x) - 1) << 2)) -- cgit v1.2.3 From 0d039eac6e5950f9d1ecc9e410c2fd1feaeab3b6 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 23 Apr 2025 02:30:34 +0100 Subject: fix a couple of races in MNT_TREE_BENEATH handling by do_move_mount() Normally do_lock_mount(path, _) is locking a mountpoint pinned by *path and at the time when matching unlock_mount() unlocks that location it is still pinned by the same thing. Unfortunately, for 'beneath' case it's no longer that simple - the object being locked is not the one *path points to. It's the mountpoint of path->mnt. The thing is, without sufficient locking ->mnt_parent may change under us and none of the locks are held at that point. The rules are * mount_lock stabilizes m->mnt_parent for any mount m. * namespace_sem stabilizes m->mnt_parent, provided that m is mounted. * if either of the above holds and refcount of m is positive, we are guaranteed the same for refcount of m->mnt_parent. namespace_sem nests inside inode_lock(), so do_lock_mount() has to take inode_lock() before grabbing namespace_sem. It does recheck that path->mnt is still mounted in the same place after getting namespace_sem, and it does take care to pin the dentry. It is needed, since otherwise we might end up with racing mount --move (or umount) happening while we were getting locks; in that case dentry would no longer be a mountpoint and could've been evicted on memory pressure along with its inode - not something you want when grabbing lock on that inode. However, pinning a dentry is not enough - the matching mount is also pinned only by the fact that path->mnt is mounted on top it and at that point we are not holding any locks whatsoever, so the same kind of races could end up with all references to that mount gone just as we are about to enter inode_lock(). If that happens, we are left with filesystem being shut down while we are holding a dentry reference on it; results are not pretty. What we need to do is grab both dentry and mount at the same time; that makes inode_lock() safe *and* avoids the problem with fs getting shut down under us. After taking namespace_sem we verify that path->mnt is still mounted (which stabilizes its ->mnt_parent) and check that it's still mounted at the same place. From that point on to the matching namespace_unlock() we are guaranteed that mount/dentry pair we'd grabbed are also pinned by being the mountpoint of path->mnt, so we can quietly drop both the dentry reference (as the current code does) and mnt one - it's OK to do under namespace_sem, since we are not dropping the final refs. That solves the problem on do_lock_mount() side; unlock_mount() also has one, since dentry is guaranteed to stay pinned only until the namespace_unlock(). 
That's easy to fix - just have inode_unlock() done earlier, while it's still pinned by mp->m_dentry. Fixes: 6ac392815628 "fs: allow to mount beneath top mount" # v6.5+ Signed-off-by: Al Viro Signed-off-by: Christian Brauner --- fs/namespace.c | 69 ++++++++++++++++++++++++++++++---------------------------- 1 file changed, 36 insertions(+), 33 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index d9ca80dcc544..98a5cd756e9a 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2826,56 +2826,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct mountpoint *mp = ERR_PTR(-ENOENT); + struct path under = {}; for (;;) { - struct mount *m; + struct mount *m = real_mount(mnt); if (beneath) { - m = real_mount(mnt); + path_put(&under); read_seqlock_excl(&mount_lock); - dentry = dget(m->mnt_mountpoint); + under.mnt = mntget(&m->mnt_parent->mnt); + under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); + dentry = under.dentry; } else { dentry = path->dentry; } inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - goto out; - } - namespace_lock(); - if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { + if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) + break; // not to be mounted on + + if (beneath && unlikely(m->mnt_mountpoint != dentry || + &m->mnt_parent->mnt != under.mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); - goto out; + continue; // got moved } mnt = lookup_mnt(path); - if (likely(!mnt)) + if (unlikely(mnt)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + continue; // got overmounted + } + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) break; - - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - dput(dentry); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - } - - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); + if (beneath) { + /* + * @under duplicates the references that will stay + * at least until namespace_unlock(), so the path_put() + * below is safe (and OK to do under namespace_lock - + * we are not dropping the final references here). + */ + path_put(&under); + } + return mp; } - -out: + namespace_unlock(); + inode_unlock(dentry->d_inode); if (beneath) - dput(dentry); - + path_put(&under); return mp; } @@ -2886,14 +2892,11 @@ static inline struct mountpoint *lock_mount(struct path *path) static void unlock_mount(struct mountpoint *where) { - struct dentry *dentry = where->m_dentry; - + inode_unlock(where->m_dentry->d_inode); read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); - namespace_unlock(); - inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) -- cgit v1.2.3 From 0db61388b389f43c1ba2f1cee3613feb4fd12150 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 22 Apr 2025 15:33:18 -0700 Subject: perf/core: Change to POLLERR for pinned events with error Commit: f4b07fd62d4d11d5 ("perf/core: Use POLLHUP for pinned events in error") started to emit POLLHUP for pinned events in an error state. But the POLLHUP is also used to signal events that the attached task is terminated. To distinguish pinned per-task events in the error state it would need to check if the task is live. 
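As a hedged, purely illustrative user-space sketch (the perf_fd setup is assumed and not shown; this is not code from the patch): once the error state is reported as POLLERR instead, a poll() consumer can tell the two conditions apart directly from revents:

    #include <poll.h>
    #include <stdio.h>

    /* Illustrative only: perf_fd is assumed to be a perf event file
     * descriptor set up elsewhere. */
    static void wait_for_event(int perf_fd)
    {
            struct pollfd pfd = { .fd = perf_fd, .events = POLLIN };

            if (poll(&pfd, 1, -1) < 0)
                    return;

            if (pfd.revents & POLLERR)
                    puts("pinned event entered an error state");
            else if (pfd.revents & POLLHUP)
                    puts("attached task exited");
            else if (pfd.revents & POLLIN)
                    puts("samples available");
    }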
Change it to POLLERR to make it clear. Suggested-by: Gabriel Marin Signed-off-by: Namhyung Kim Signed-off-by: Ingo Molnar Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Kan Liang Cc: Mark Rutland Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250422223318.180343-1-namhyung@kernel.org --- kernel/events/core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index e93c19565914..95e703891b24 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3943,7 +3943,7 @@ static int merge_sched_in(struct perf_event *event, void *data) perf_event_set_state(event, PERF_EVENT_STATE_ERROR); if (*perf_event_fasync(event)) - event->pending_kill = POLL_HUP; + event->pending_kill = POLL_ERR; perf_event_wakeup(event); } else { @@ -6075,7 +6075,7 @@ static __poll_t perf_poll(struct file *file, poll_table *wait) if (unlikely(READ_ONCE(event->state) == PERF_EVENT_STATE_ERROR && event->attr.pinned)) - return events; + return EPOLLERR; /* * Pin the event->rb by taking event->mmap_mutex; otherwise -- cgit v1.2.3 From cfa00a625f1c730e93f96b5b4ba7c1b4dc286c79 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Fri, 6 Dec 2024 19:45:31 +0800 Subject: drm/exynos: Remove unnecessary checking It is not needed since drm_atomic_helper_shutdown checks it. Signed-off-by: Guoqing Jiang Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_drv.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_drv.c b/drivers/gpu/drm/exynos/exynos_drm_drv.c index f313ae7bc3a3..6cc7bf77bcac 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_drv.c +++ b/drivers/gpu/drm/exynos/exynos_drm_drv.c @@ -355,8 +355,7 @@ static void exynos_drm_platform_shutdown(struct platform_device *pdev) { struct drm_device *drm = platform_get_drvdata(pdev); - if (drm) - drm_atomic_helper_shutdown(drm); + drm_atomic_helper_shutdown(drm); } static struct platform_driver exynos_drm_platform_driver = { -- cgit v1.2.3 From 0253dadc772e83aaa67aea8bf24a71e7ffe13cb0 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Thu, 6 Mar 2025 12:27:20 +0800 Subject: drm/exynos/vidi: Remove redundant error handling in vidi_get_modes() In the vidi_get_modes() function, if either drm_edid_dup() or drm_edid_alloc() fails, the function will immediately return 0, indicating that no display modes can be retrieved. However, in the event of failure in these two functions, it is still necessary to call the subsequent drm_edid_connector_update() function with a NULL drm_edid as an argument. This ensures that operations such as connector settings are performed in its callee function, _drm_edid_connector_property_update. To maintain the integrity of the operation, redundant error handling needs to be removed. 
Signed-off-by: Wentao Liang Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_vidi.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_vidi.c b/drivers/gpu/drm/exynos/exynos_drm_vidi.c index 08cf79a62025..e644e2382d77 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_vidi.c +++ b/drivers/gpu/drm/exynos/exynos_drm_vidi.c @@ -312,9 +312,6 @@ static int vidi_get_modes(struct drm_connector *connector) else drm_edid = drm_edid_alloc(fake_edid_info, sizeof(fake_edid_info)); - if (!drm_edid) - return 0; - drm_edid_connector_update(connector, drm_edid); count = drm_edid_connector_add_modes(connector); -- cgit v1.2.3 From 30b66dd0523df5153319a2abaa2399c7c76945cb Mon Sep 17 00:00:00 2001 From: Anindya Sundar Gayen Date: Fri, 28 Feb 2025 19:32:57 +0530 Subject: drm/exynos: fixed a spelling error Corrected a spelling mistake in the exynos_drm_fimd driver to improve code readability. No functional changes were made. Signed-off-by: Anindya Sundar Gayen Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimd.c b/drivers/gpu/drm/exynos/exynos_drm_fimd.c index 1ad87584b1c2..c394cc702d7d 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimd.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimd.c @@ -731,7 +731,7 @@ static void fimd_win_set_pixfmt(struct fimd_context *ctx, unsigned int win, /* * Setting dma-burst to 16Word causes permanent tearing for very small * buffers, e.g. cursor buffer. Burst Mode switching which based on - * plane size is not recommended as plane size varies alot towards the + * plane size is not recommended as plane size varies a lot towards the * end of the screen and rapid movement causes unstable DMA, but it is * still better to change dma-burst than displaying garbage. */ -- cgit v1.2.3 From e8de68ba86f4f84d388f2d964eba96c034120a84 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 18 Mar 2025 09:07:38 +0100 Subject: drm/exynos: exynos7_drm_decon: Consstify struct decon_data static 'struct decon_data' is only read, so it can be const for code safety. Signed-off-by: Krzysztof Kozlowski Reviewed-by: Alim Akhtar Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos7_drm_decon.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/exynos/exynos7_drm_decon.c b/drivers/gpu/drm/exynos/exynos7_drm_decon.c index 5170f72b0830..f91daefa9d2b 100644 --- a/drivers/gpu/drm/exynos/exynos7_drm_decon.c +++ b/drivers/gpu/drm/exynos/exynos7_drm_decon.c @@ -43,13 +43,13 @@ struct decon_data { unsigned int wincon_burstlen_shift; }; -static struct decon_data exynos7_decon_data = { +static const struct decon_data exynos7_decon_data = { .vidw_buf_start_base = 0x80, .shadowcon_win_protect_shift = 10, .wincon_burstlen_shift = 11, }; -static struct decon_data exynos7870_decon_data = { +static const struct decon_data exynos7870_decon_data = { .vidw_buf_start_base = 0x880, .shadowcon_win_protect_shift = 8, .wincon_burstlen_shift = 10, -- cgit v1.2.3 From c171ad1e8166ff8b3ab9ac94bad2574167b41f66 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Apr 2025 14:07:00 +0100 Subject: drm/exynos: Fix spelling mistake "enqueu" -> "enqueue" There is a spelling mistake in a DRM_DEV_DEBUG_KMS message. Fix it. 
Signed-off-by: Colin Ian King Signed-off-by: Inki Dae --- drivers/gpu/drm/exynos/exynos_drm_fimc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/exynos/exynos_drm_fimc.c b/drivers/gpu/drm/exynos/exynos_drm_fimc.c index b150cfd92f6e..09e33a26caaf 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_fimc.c +++ b/drivers/gpu/drm/exynos/exynos_drm_fimc.c @@ -908,7 +908,7 @@ static void fimc_dst_set_buf_seq(struct fimc_context *ctx, u32 buf_id, u32 buf_num; u32 cfg; - DRM_DEV_DEBUG_KMS(ctx->dev, "buf_id[%d]enqueu[%d]\n", buf_id, enqueue); + DRM_DEV_DEBUG_KMS(ctx->dev, "buf_id[%d]enqueue[%d]\n", buf_id, enqueue); spin_lock_irqsave(&ctx->lock, flags); -- cgit v1.2.3 From 4ce385f56434f3810ef103e1baea357ddcc6667e Mon Sep 17 00:00:00 2001 From: Juergen Gross Date: Tue, 22 Apr 2025 15:17:17 +0200 Subject: x86/mm: Fix _pgd_alloc() for Xen PV mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recently _pgd_alloc() was switched from using __get_free_pages() to pagetable_alloc_noprof(), which might return a compound page in case the allocation order is larger than 0. On x86 this will be the case if CONFIG_MITIGATION_PAGE_TABLE_ISOLATION is set, even if PTI has been disabled at runtime. When running as a Xen PV guest (this will always disable PTI), using a compound page for a PGD will result in VM_BUG_ON_PGFLAGS being triggered when the Xen code tries to pin the PGD. Fix the Xen issue together with the not needed 8k allocation for a PGD with PTI disabled by replacing PGD_ALLOCATION_ORDER with an inline helper returning the needed order for PGD allocations. Fixes: a9b3c355c2e6 ("asm-generic: pgalloc: provide generic __pgd_{alloc,free}") Reported-by: Petr Vaněk Signed-off-by: Juergen Gross Signed-off-by: Dave Hansen Tested-by: Petr Vaněk Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/20250422131717.25724-1-jgross%40suse.com --- arch/x86/include/asm/pgalloc.h | 19 +++++++++++-------- arch/x86/kernel/machine_kexec_32.c | 4 ++-- arch/x86/mm/pgtable.c | 4 ++-- arch/x86/platform/efi/efi_64.c | 4 ++-- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h index a33147520044..c88691b15f3c 100644 --- a/arch/x86/include/asm/pgalloc.h +++ b/arch/x86/include/asm/pgalloc.h @@ -6,6 +6,8 @@ #include /* for struct page */ #include +#include + #define __HAVE_ARCH_PTE_ALLOC_ONE #define __HAVE_ARCH_PGD_FREE #include @@ -29,16 +31,17 @@ static inline void paravirt_release_pud(unsigned long pfn) {} static inline void paravirt_release_p4d(unsigned long pfn) {} #endif -#ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION /* - * Instead of one PGD, we acquire two PGDs. Being order-1, it is - * both 8k in size and 8k-aligned. That lets us just flip bit 12 - * in a pointer to swap between the two 4k halves. + * In case of Page Table Isolation active, we acquire two PGDs instead of one. + * Being order-1, it is both 8k in size and 8k-aligned. That lets us just + * flip bit 12 in a pointer to swap between the two 4k halves. */ -#define PGD_ALLOCATION_ORDER 1 -#else -#define PGD_ALLOCATION_ORDER 0 -#endif +static inline unsigned int pgd_allocation_order(void) +{ + if (cpu_feature_enabled(X86_FEATURE_PTI)) + return 1; + return 0; +} /* * Allocate and free page tables. 
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index 80265162aeff..1f325304c4a8 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -42,7 +42,7 @@ static void load_segments(void) static void machine_kexec_free_page_tables(struct kimage *image) { - free_pages((unsigned long)image->arch.pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)image->arch.pgd, pgd_allocation_order()); image->arch.pgd = NULL; #ifdef CONFIG_X86_PAE free_page((unsigned long)image->arch.pmd0); @@ -59,7 +59,7 @@ static void machine_kexec_free_page_tables(struct kimage *image) static int machine_kexec_alloc_page_tables(struct kimage *image) { image->arch.pgd = (pgd_t *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, - PGD_ALLOCATION_ORDER); + pgd_allocation_order()); #ifdef CONFIG_X86_PAE image->arch.pmd0 = (pmd_t *)get_zeroed_page(GFP_KERNEL); image->arch.pmd1 = (pmd_t *)get_zeroed_page(GFP_KERNEL); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index a05fcddfc811..f7ae44d3dd9e 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -360,7 +360,7 @@ static inline pgd_t *_pgd_alloc(struct mm_struct *mm) * We allocate one page for pgd. */ if (!SHARED_KERNEL_PMD) - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); /* * Now PAE kernel is not running as a Xen domain. We can allocate @@ -380,7 +380,7 @@ static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) static inline pgd_t *_pgd_alloc(struct mm_struct *mm) { - return __pgd_alloc(mm, PGD_ALLOCATION_ORDER); + return __pgd_alloc(mm, pgd_allocation_order()); } static inline void _pgd_free(struct mm_struct *mm, pgd_t *pgd) diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index ac57259a432b..a4b4ebd41b8f 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -73,7 +73,7 @@ int __init efi_alloc_page_tables(void) gfp_t gfp_mask; gfp_mask = GFP_KERNEL | __GFP_ZERO; - efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, PGD_ALLOCATION_ORDER); + efi_pgd = (pgd_t *)__get_free_pages(gfp_mask, pgd_allocation_order()); if (!efi_pgd) goto fail; @@ -96,7 +96,7 @@ free_p4d: if (pgtable_l5_enabled()) free_page((unsigned long)pgd_page_vaddr(*pgd)); free_pgd: - free_pages((unsigned long)efi_pgd, PGD_ALLOCATION_ORDER); + free_pages((unsigned long)efi_pgd, pgd_allocation_order()); fail: return -ENOMEM; } -- cgit v1.2.3 From 82efd569a8909f2b13140c1b3de88535aea0b051 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 23 Apr 2025 10:21:29 +0200 Subject: locking/local_lock: fix _Generic() matching of local_trylock_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Michael Larabel reported [1] a nginx performance regression in v6.15-rc3 and bisected it to commit 51339d99c013 ("locking/local_lock, mm: replace localtry_ helpers with local_trylock_t type") The problem is the _Generic() usage with a default association that masks the fact that "local_trylock_t *" association is not being selected as expected. Replacing the default with the only other expected type "local_lock_t *" reveals the underlying problem: include/linux/local_lock_internal.h:174:26: error: ‘_Generic’ selector of type ‘__seg_gs local_lock_t *’ is not compatible with any association The local_locki's are part of __percpu structures and thus the __percpu attribute is needed to associate the type properly. 
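A minimal user-space sketch of the same _Generic pitfall, using a const qualifier on the pointed-to type as a stand-in for the __percpu/__seg_gs annotation (the types and macros here are illustrative, not the kernel ones):

    #include <stdio.h>

    struct lock { int x; };

    /* With a default: arm the qualifier mismatch is silently absorbed. */
    #define KIND_LAX(p) _Generic((p),                  \
            struct lock *: "lock",                     \
            default:       "unknown")

    /* Naming only the expected types turns the same mismatch into a
     * compile error, which is the effect of dropping default: below. */
    #define KIND_STRICT(p) _Generic((p),               \
            struct lock *:       "plain lock",         \
            const struct lock *: "const lock")

    int main(void)
    {
            const struct lock *p = NULL;

            puts(KIND_LAX(p));     /* prints "unknown": never matched as intended */
            puts(KIND_STRICT(p));  /* matches only because the const arm exists */
            return 0;
    }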
Add the attribute and keep the default replaced to turn any further mismatches into compile errors. The failure to recognize local_try_lock_t in __local_lock_release() means that a local_trylock[_irqsave]() operation will set tl->acquired to 1 (there's no _Generic() part in the trylock code), but then local_unlock[_irqrestore]() will not set tl->acquired back to 0, so further trylock operations will always fail on the same cpu+lock, while non-trylock operations continue to work - a lockdep_assert() is also not being executed in the _Generic() part of local_lock() code. This means consume_stock() and refill_stock() operations will fail deterministically, resulting in taking the slow paths and worse performance. Fixes: 51339d99c013 ("locking/local_lock, mm: replace localtry_ helpers with local_trylock_t type") Reported-by: Michael Larabel Closes: https://www.phoronix.com/review/linux-615-nginx-regression/2 [1] Signed-off-by: Vlastimil Babka Acked-by: Alexei Starovoitov Signed-off-by: Linus Torvalds --- include/linux/local_lock_internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index bf2bf40d7b18..8d5ac16a9b17 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -102,11 +102,11 @@ do { \ l = (local_lock_t *)this_cpu_ptr(lock); \ tl = (local_trylock_t *)l; \ _Generic((lock), \ - local_trylock_t *: ({ \ + __percpu local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 0); \ WRITE_ONCE(tl->acquired, 1); \ }), \ - default:(void)0); \ + __percpu local_lock_t *: (void)0); \ local_lock_acquire(l); \ } while (0) @@ -171,11 +171,11 @@ do { \ tl = (local_trylock_t *)l; \ local_lock_release(l); \ _Generic((lock), \ - local_trylock_t *: ({ \ + __percpu local_trylock_t *: ({ \ lockdep_assert(tl->acquired == 1); \ WRITE_ONCE(tl->acquired, 0); \ }), \ - default:(void)0); \ + __percpu local_lock_t *: (void)0); \ } while (0) #define __local_unlock(lock) \ -- cgit v1.2.3 From a79be02bba5c31f967885c7f3bf3a756d77d11d9 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 23 Apr 2025 10:08:29 -0700 Subject: Fix mis-uses of 'cc-option' for warning disablement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This was triggered by one of my mis-uses causing odd build warnings on sparc in linux-next, but while figuring out why the "obviously correct" use of cc-option caused such odd breakage, I found eight other cases of the same thing in the tree. The root cause is that 'cc-option' doesn't work for checking negative warning options (ie things like '-Wno-stringop-overflow') because gcc will silently accept options it doesn't recognize, and so 'cc-option' ends up thinking they are perfectly fine. And it all works, until you have a situation where _another_ warning is emitted. At that point the compiler will go "Hmm, maybe the user intended to disable this warning but used that wrong option that I didn't recognize", and generate a warning for the unrecognized negative option. Which explains why we have several cases of this in the tree: the 'cc-option' test really doesn't work for this situation, but most of the time it simply doesn't matter that ity doesn't work. The reason my recently added case caused problems on sparc was pointed out by Thomas Weißschuh: the sparc build had a previous explicit warning that then triggered the new one. 
I think the best fix for this would be to make 'cc-option' a bit smarter about this sitation, possibly by adding an intentional warning to the test case that then triggers the unrecognized option warning reliably. But the short-term fix is to replace 'cc-option' with an existing helper designed for this exact case: 'cc-disable-warning', which picks the negative warning but uses the positive form for testing the compiler support. Reported-by: Stephen Rothwell Link: https://lore.kernel.org/all/20250422204718.0b4e3f81@canb.auug.org.au/ Explained-by: Thomas Weißschuh Signed-off-by: Linus Torvalds --- Makefile | 4 ++-- arch/loongarch/kernel/Makefile | 8 ++++---- arch/loongarch/kvm/Makefile | 2 +- arch/riscv/kernel/Makefile | 4 ++-- scripts/Makefile.extrawarn | 2 +- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/Makefile b/Makefile index e94bbb2298c8..07f818186151 100644 --- a/Makefile +++ b/Makefile @@ -1053,11 +1053,11 @@ NOSTDINC_FLAGS += -nostdinc KBUILD_CFLAGS += $(call cc-option, -fstrict-flex-arrays=3) #Currently, disable -Wstringop-overflow for GCC 11, globally. -KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-option, -Wno-stringop-overflow) +KBUILD_CFLAGS-$(CONFIG_CC_NO_STRINGOP_OVERFLOW) += $(call cc-disable-warning, stringop-overflow) KBUILD_CFLAGS-$(CONFIG_CC_STRINGOP_OVERFLOW) += $(call cc-option, -Wstringop-overflow) #Currently, disable -Wunterminated-string-initialization as broken -KBUILD_CFLAGS += $(call cc-option, -Wno-unterminated-string-initialization) +KBUILD_CFLAGS += $(call cc-disable-warning, unterminated-string-initialization) # disable invalid "can't wrap" optimizations for signed / pointers KBUILD_CFLAGS += -fno-strict-overflow diff --git a/arch/loongarch/kernel/Makefile b/arch/loongarch/kernel/Makefile index 4853e8b04c6f..f9dcaa60033d 100644 --- a/arch/loongarch/kernel/Makefile +++ b/arch/loongarch/kernel/Makefile @@ -21,10 +21,10 @@ obj-$(CONFIG_CPU_HAS_LBT) += lbt.o obj-$(CONFIG_ARCH_STRICT_ALIGN) += unaligned.o -CFLAGS_module.o += $(call cc-option,-Wno-override-init,) -CFLAGS_syscall.o += $(call cc-option,-Wno-override-init,) -CFLAGS_traps.o += $(call cc-option,-Wno-override-init,) -CFLAGS_perf_event.o += $(call cc-option,-Wno-override-init,) +CFLAGS_module.o += $(call cc-disable-warning, override-init) +CFLAGS_syscall.o += $(call cc-disable-warning, override-init) +CFLAGS_traps.o += $(call cc-disable-warning, override-init) +CFLAGS_perf_event.o += $(call cc-disable-warning, override-init) ifdef CONFIG_FUNCTION_TRACER ifndef CONFIG_DYNAMIC_FTRACE diff --git a/arch/loongarch/kvm/Makefile b/arch/loongarch/kvm/Makefile index f4c8e35c216a..cb41d9265662 100644 --- a/arch/loongarch/kvm/Makefile +++ b/arch/loongarch/kvm/Makefile @@ -21,4 +21,4 @@ kvm-y += intc/eiointc.o kvm-y += intc/pch_pic.o kvm-y += irqfd.o -CFLAGS_exit.o += $(call cc-option,-Wno-override-init,) +CFLAGS_exit.o += $(call cc-disable-warning, override-init) diff --git a/arch/riscv/kernel/Makefile b/arch/riscv/kernel/Makefile index 8d186bfced45..f7480c9c6f8d 100644 --- a/arch/riscv/kernel/Makefile +++ b/arch/riscv/kernel/Makefile @@ -9,8 +9,8 @@ CFLAGS_REMOVE_patch.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_sbi.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_return_address.o = $(CC_FLAGS_FTRACE) endif -CFLAGS_syscall_table.o += $(call cc-option,-Wno-override-init,) -CFLAGS_compat_syscall_table.o += $(call cc-option,-Wno-override-init,) +CFLAGS_syscall_table.o += $(call cc-disable-warning, override-init) +CFLAGS_compat_syscall_table.o += $(call cc-disable-warning, override-init) ifdef 
CONFIG_KEXEC_CORE AFLAGS_kexec_relocate.o := -mcmodel=medany $(call cc-option,-mno-relax) diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index d75897559d18..2d6e59561c9d 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -15,7 +15,7 @@ KBUILD_CFLAGS += -Werror=return-type KBUILD_CFLAGS += -Werror=strict-prototypes KBUILD_CFLAGS += -Wno-format-security KBUILD_CFLAGS += -Wno-trigraphs -KBUILD_CFLAGS += $(call cc-disable-warning,frame-address,) +KBUILD_CFLAGS += $(call cc-disable-warning, frame-address) KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) KBUILD_CFLAGS += -Wmissing-declarations KBUILD_CFLAGS += -Wmissing-prototypes -- cgit v1.2.3 From f2858f308131a09e33afb766cd70119b5b900569 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 16 Apr 2025 10:02:46 -0700 Subject: selftests/bpf: Mitigate sockmap_ktls disconnect_after_delete failure "sockmap_ktls disconnect_after_delete" test has been failing on BPF CI after recent merges from netdev: * https://github.com/kernel-patches/bpf/actions/runs/14458537639 * https://github.com/kernel-patches/bpf/actions/runs/14457178732 It happens because disconnect has been disabled for TLS [1], and it renders the test case invalid. Removing all the test code creates a conflict between bpf and bpf-next, so for now only remove the offending assert [2]. The test will be removed later on bpf-next. [1] https://lore.kernel.org/netdev/20250404180334.3224206-1-kuba@kernel.org/ [2] https://lore.kernel.org/bpf/cfc371285323e1a3f3b006bfcf74e6cf7ad65258@linux.dev/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Reviewed-by: Jiayuan Chen Link: https://lore.kernel.org/bpf/20250416170246.2438524-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index 2d0796314862..0a99fd404f6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -68,7 +68,6 @@ static void test_sockmap_ktls_disconnect_after_delete(int family, int map) goto close_cli; err = disconnect(cli); - ASSERT_OK(err, "disconnect"); close_cli: close(cli); -- cgit v1.2.3 From c0e473a0d226479e8e925d5ba93f751d8df628e9 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 23 Apr 2025 12:53:42 -0700 Subject: block: fix race between set_blocksize and read paths With the new large sector size support, it's now the case that set_blocksize can change i_blksize and the folio order in a manner that conflicts with a concurrent reader and causes a kernel crash. Specifically, let's say that udev-worker calls libblkid to detect the labels on a block device. The read call can create an order-0 folio to read the first 4096 bytes from the disk. But then udev is preempted. Next, someone tries to mount an 8k-sectorsize filesystem from the same block device. The filesystem calls set_blksize, which sets i_blksize to 8192 and the minimum folio order to 1. Now udev resumes, still holding the order-0 folio it allocated. It then tries to schedule a read bio and do_mpage_readahead tries to create bufferheads for the folio. Unfortunately, blocks_per_folio == 0 because the page size is 4096 but the blocksize is 8192 so no bufferheads are attached and the bh walk never sets bdev. We then submit the bio with a NULL block device and crash. 
Therefore, truncate the page cache after flushing but before updating i_blksize. However, that's not enough -- we also need to lock out file IO and page faults during the update. Take both the i_rwsem and the invalidate_lock in exclusive mode for invalidations, and in shared mode for read/write operations. I don't know if this is the correct fix, but xfs/259 found it. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Tested-by: Shin'ichiro Kawasaki Link: https://lore.kernel.org/r/174543795699.4139148.2086129139322431423.stgit@frogsfrogsfrogs Signed-off-by: Jens Axboe --- block/bdev.c | 17 +++++++++++++++++ block/blk-zoned.c | 5 ++++- block/fops.c | 16 ++++++++++++++++ block/ioctl.c | 6 ++++++ 4 files changed, 43 insertions(+), 1 deletion(-) diff --git a/block/bdev.c b/block/bdev.c index 4844d1e27b6f..3c856e56d5df 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -169,10 +169,27 @@ int set_blocksize(struct file *file, int size) /* Don't change the size if it is same as current */ if (inode->i_blkbits != blksize_bits(size)) { + /* + * Flush and truncate the pagecache before we reconfigure the + * mapping geometry because folio sizes are variable now. If a + * reader has already allocated a folio whose size is smaller + * than the new min_order but invokes readahead after the new + * min_order becomes visible, readahead will think there are + * "zero" blocks per folio and crash. Take the inode and + * invalidation locks to avoid racing with + * read/write/fallocate. + */ + inode_lock(inode); + filemap_invalidate_lock(inode->i_mapping); + sync_blockdev(bdev); + kill_bdev(bdev); + inode->i_blkbits = blksize_bits(size); mapping_set_folio_min_order(inode->i_mapping, get_order(size)); kill_bdev(bdev); + filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); } return 0; } diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 0c77244a35c9..8f15d1aa6eb8 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -343,6 +343,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, op = REQ_OP_ZONE_RESET; /* Invalidate the page cache, including dirty pages. */ + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); ret = blkdev_truncate_zone_range(bdev, mode, &zrange); if (ret) @@ -364,8 +365,10 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors); fail: - if (cmd == BLKRESETZONE) + if (cmd == BLKRESETZONE) { filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); + } return ret; } diff --git a/block/fops.c b/block/fops.c index be9f1dbea9ce..e221fdcaa8aa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -746,7 +746,14 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) ret = direct_write_fallback(iocb, from, ret, blkdev_buffered_write(iocb, from)); } else { + /* + * Take i_rwsem and invalidate_lock to avoid racing with + * set_blocksize changing i_blkbits/folio order and punching + * out the pagecache. 
+ */ + inode_lock_shared(bd_inode); ret = blkdev_buffered_write(iocb, from); + inode_unlock_shared(bd_inode); } if (ret > 0) @@ -757,6 +764,7 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { + struct inode *bd_inode = bdev_file_inode(iocb->ki_filp); struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; @@ -793,7 +801,13 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) goto reexpand; } + /* + * Take i_rwsem and invalidate_lock to avoid racing with set_blocksize + * changing i_blkbits/folio order and punching out the pagecache. + */ + inode_lock_shared(bd_inode); ret = filemap_read(iocb, to, ret); + inode_unlock_shared(bd_inode); reexpand: if (unlikely(shorted)) @@ -836,6 +850,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, if ((start | len) & (bdev_logical_block_size(bdev) - 1)) return -EINVAL; + inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); /* @@ -868,6 +883,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, fail: filemap_invalidate_unlock(inode->i_mapping); + inode_unlock(inode); return error; } diff --git a/block/ioctl.c b/block/ioctl.c index faa40f383e27..e472cc1030c6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -142,6 +142,7 @@ static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, if (err) return err; + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, start + len - 1); if (err) @@ -174,6 +175,7 @@ out_unplug: blk_finish_plug(&plug); fail: filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -199,12 +201,14 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, end > bdev_nr_bytes(bdev)) return -EINVAL; + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } @@ -236,6 +240,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, return -EINVAL; /* Invalidate the page cache, including dirty pages */ + inode_lock(bdev->bd_mapping->host); filemap_invalidate_lock(bdev->bd_mapping); err = truncate_bdev_range(bdev, mode, start, end); if (err) @@ -246,6 +251,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, fail: filemap_invalidate_unlock(bdev->bd_mapping); + inode_unlock(bdev->bd_mapping->host); return err; } -- cgit v1.2.3 From e03463d247ddac66e71143468373df3d74a3a6bd Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 23 Apr 2025 12:53:57 -0700 Subject: block: hoist block size validation code to a separate function Hoist the block size validation code to bdev_validate_blocksize so that we can call it from filesystems that don't care about the bdev pagecache manipulations of set_blocksize. Signed-off-by: Darrick J. 
Wong Reviewed-by: Luis Chamberlain Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/174543795720.4139148.840349813093799165.stgit@frogsfrogsfrogs Signed-off-by: Jens Axboe --- block/bdev.c | 33 +++++++++++++++++++++++++++------ include/linux/blkdev.h | 1 + 2 files changed, 28 insertions(+), 6 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index 3c856e56d5df..e29b3b6d1707 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -152,18 +152,39 @@ static void set_init_blocksize(struct block_device *bdev) get_order(bsize)); } -int set_blocksize(struct file *file, int size) +/** + * bdev_validate_blocksize - check that this block size is acceptable + * @bdev: blockdevice to check + * @block_size: block size to check + * + * For block device users that do not use buffer heads or the block device + * page cache, make sure that this block size can be used with the device. + * + * Return: On success zero is returned, negative error code on failure. + */ +int bdev_validate_blocksize(struct block_device *bdev, int block_size) { - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = I_BDEV(inode); - - if (blk_validate_block_size(size)) + if (blk_validate_block_size(block_size)) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_logical_block_size(bdev)) + if (block_size < bdev_logical_block_size(bdev)) return -EINVAL; + return 0; +} +EXPORT_SYMBOL_GPL(bdev_validate_blocksize); + +int set_blocksize(struct file *file, int size) +{ + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = I_BDEV(inode); + int ret; + + ret = bdev_validate_blocksize(bdev, size); + if (ret) + return ret; + if (!file->private_data) return -EINVAL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e39c45bc0a97..c6b478cfb32d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1614,6 +1614,7 @@ static inline void bio_end_io_acct(struct bio *bio, unsigned long start_time) return bio_end_io_acct_remapped(bio, start_time, bio->bi_bdev); } +int bdev_validate_blocksize(struct block_device *bdev, int block_size); int set_blocksize(struct file *file, int size); int lookup_bdev(const char *pathname, dev_t *dev); -- cgit v1.2.3 From 5533bc70aedc7c9872841ac8649344f8cbc6bc4c Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 22 Apr 2025 07:59:41 +0800 Subject: selftests: ublk: fix recover test When adding recovery test: - 'break' is missed for handling '-g' argument - test name of test_generic_05.sh is wrong So fix the two. 
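A hedged sketch of the bug class being fixed (generic option parsing, not the kublk code itself): without the break, the '-g' handler falls through into the following case, so that case's handling runs on every -g as well.

    #include <unistd.h>

    int main(int argc, char *argv[])
    {
            int opt, need_get_data = 0, debug = 0;

            while ((opt = getopt(argc, argv, "gd")) != -1) {
                    switch (opt) {
                    case 'g':
                            need_get_data = 1;
                            break;  /* the missing statement: omit it and -g also runs the 'd' branch */
                    case 'd':
                            debug = 1;
                            break;
                    }
            }
            return need_get_data + debug;
    }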
Fixes: 57e13a2e8cd2 ("selftests: ublk: support user recovery") Signed-off-by: Ming Lei Reviewed-by: Uday Shankar Link: https://lore.kernel.org/r/20250421235947.715272-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.c | 1 + tools/testing/selftests/ublk/test_generic_05.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/ublk/kublk.c b/tools/testing/selftests/ublk/kublk.c index 759f06637146..e57a1486bb48 100644 --- a/tools/testing/selftests/ublk/kublk.c +++ b/tools/testing/selftests/ublk/kublk.c @@ -1354,6 +1354,7 @@ int main(int argc, char *argv[]) value = strtol(optarg, NULL, 10); if (value) ctx.flags |= UBLK_F_NEED_GET_DATA; + break; case 0: if (!strcmp(longopts[option_idx].name, "debug_mask")) ublk_dbg_mask = strtol(optarg, NULL, 16); diff --git a/tools/testing/selftests/ublk/test_generic_05.sh b/tools/testing/selftests/ublk/test_generic_05.sh index 714630b4b329..3bb00a347402 100755 --- a/tools/testing/selftests/ublk/test_generic_05.sh +++ b/tools/testing/selftests/ublk/test_generic_05.sh @@ -3,7 +3,7 @@ . "$(cd "$(dirname "$0")" && pwd)"/test_common.sh -TID="generic_04" +TID="generic_05" ERR_CODE=0 ublk_run_recover_test() -- cgit v1.2.3 From 8f503637898313c048bf21e386e09be90e30cc31 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Tue, 22 Apr 2025 07:59:42 +0800 Subject: selftests: ublk: remove useless 'delay_us' from 'struct dev_ctx' 'delay_us' shouldn't be added to 'struct dev_ctx' since now it is handled by per-target command line & 'struct fault_inject_ctx'. So remove it. Fixes: 81586652bb1f ("selftests: ublk: add generic_06 for covering fault inject") Signed-off-by: Ming Lei Reviewed-by: Uday Shankar Link: https://lore.kernel.org/r/20250421235947.715272-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/kublk.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/tools/testing/selftests/ublk/kublk.h b/tools/testing/selftests/ublk/kublk.h index 29571eb296f1..918db5cd633f 100644 --- a/tools/testing/selftests/ublk/kublk.h +++ b/tools/testing/selftests/ublk/kublk.h @@ -86,9 +86,6 @@ struct dev_ctx { unsigned int fg:1; unsigned int recovery:1; - /* fault_inject */ - long long delay_us; - int _evtfd; int _shmid; -- cgit v1.2.3 From 442cacac2d9935a0698332a568afcb5c6ab8be17 Mon Sep 17 00:00:00 2001 From: Niklas Cassel Date: Wed, 16 Apr 2025 16:28:26 +0200 Subject: misc: pci_endpoint_test: Defer IRQ allocation until ioctl(PCITEST_SET_IRQTYPE) Commit a402006d48a9 ("misc: pci_endpoint_test: Remove global 'irq_type' and 'no_msi'") changed so that the default IRQ vector requested by pci_endpoint_test_probe() was no longer the module param 'irq_type', but instead test->irq_type. test->irq_type is by default IRQ_TYPE_UNDEFINED (until someone calls ioctl(PCITEST_SET_IRQTYPE)). However, the commit also changed so that after initializing test->irq_type to IRQ_TYPE_UNDEFINED, it also overrides it with driver_data->irq_type, if the PCI device and vendor ID provides driver_data. 
This causes a regression for PCI device and vendor IDs that do not provide driver_data, and the host side pci_endpoint_test_driver driver failed to probe on such platforms: pci-endpoint-test 0001:01:00.0: Invalid IRQ type selected pci-endpoint-test 0001:01:00.0: probe with driver pci-endpoint-test failed with error -22 Considering that the pci endpoint selftests and the old pcitest.sh always call ioctl(PCITEST_SET_IRQTYPE) before performing any test that requires IRQs, fix the regression by removing the allocation of IRQs in pci_endpoint_test_probe(). The IRQ allocation will occur when ioctl(PCITEST_SET_IRQTYPE) is called. A positive side effect of this is that even if the endpoint controller has issues with IRQs, the user can do still do all the tests/ioctls() that do not require working IRQs, e.g. PCITEST_BAR and PCITEST_BARS. This also means that we can remove the now unused irq_type from driver_data. The irq_type will always be the one configured by the user using ioctl(PCITEST_SET_IRQTYPE). (A user that does not know, or care which irq_type that is used, can use PCITEST_IRQ_TYPE_AUTO. This has superseded the need for a default irq_type in driver_data.) [bhelgaas: add probe failure details] Fixes: a402006d48a9c ("misc: pci_endpoint_test: Remove global 'irq_type' and 'no_msi'") Signed-off-by: Niklas Cassel Signed-off-by: Manivannan Sadhasivam Signed-off-by: Bjorn Helgaas Tested-by: Frank Li Reviewed-by: Manivannan Sadhasivam Reviewed-by: Frank Li Link: https://patch.msgid.link/20250416142825.336554-2-cassel@kernel.org --- drivers/misc/pci_endpoint_test.c | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/drivers/misc/pci_endpoint_test.c b/drivers/misc/pci_endpoint_test.c index d294850a35a1..c4e5e2c977be 100644 --- a/drivers/misc/pci_endpoint_test.c +++ b/drivers/misc/pci_endpoint_test.c @@ -122,7 +122,6 @@ struct pci_endpoint_test { struct pci_endpoint_test_data { enum pci_barno test_reg_bar; size_t alignment; - int irq_type; }; static inline u32 pci_endpoint_test_readl(struct pci_endpoint_test *test, @@ -948,7 +947,6 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, test_reg_bar = data->test_reg_bar; test->test_reg_bar = test_reg_bar; test->alignment = data->alignment; - test->irq_type = data->irq_type; } init_completion(&test->irq_raised); @@ -970,10 +968,6 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, pci_set_master(pdev); - ret = pci_endpoint_test_alloc_irq_vectors(test, test->irq_type); - if (ret) - goto err_disable_irq; - for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { if (pci_resource_flags(pdev, bar) & IORESOURCE_MEM) { base = pci_ioremap_bar(pdev, bar); @@ -1009,10 +1003,6 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, goto err_ida_remove; } - ret = pci_endpoint_test_request_irq(test); - if (ret) - goto err_kfree_test_name; - pci_endpoint_test_get_capabilities(test); misc_device = &test->miscdev; @@ -1020,7 +1010,7 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, misc_device->name = kstrdup(name, GFP_KERNEL); if (!misc_device->name) { ret = -ENOMEM; - goto err_release_irq; + goto err_kfree_test_name; } misc_device->parent = &pdev->dev; misc_device->fops = &pci_endpoint_test_fops; @@ -1036,9 +1026,6 @@ static int pci_endpoint_test_probe(struct pci_dev *pdev, err_kfree_name: kfree(misc_device->name); -err_release_irq: - pci_endpoint_test_release_irq(test); - err_kfree_test_name: kfree(test->name); @@ -1051,8 +1038,6 @@ err_iounmap: pci_iounmap(pdev, test->bar[bar]); } -err_disable_irq: - 
pci_endpoint_test_free_irq_vectors(test); pci_release_regions(pdev); err_disable_pdev: @@ -1092,23 +1077,19 @@ static void pci_endpoint_test_remove(struct pci_dev *pdev) static const struct pci_endpoint_test_data default_data = { .test_reg_bar = BAR_0, .alignment = SZ_4K, - .irq_type = PCITEST_IRQ_TYPE_MSI, }; static const struct pci_endpoint_test_data am654_data = { .test_reg_bar = BAR_2, .alignment = SZ_64K, - .irq_type = PCITEST_IRQ_TYPE_MSI, }; static const struct pci_endpoint_test_data j721e_data = { .alignment = 256, - .irq_type = PCITEST_IRQ_TYPE_MSI, }; static const struct pci_endpoint_test_data rk3588_data = { .alignment = SZ_64K, - .irq_type = PCITEST_IRQ_TYPE_MSI, }; /* -- cgit v1.2.3 From 13b4ece33cf9def67966bb8716783c42cec20617 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Mon, 21 Apr 2025 19:07:13 +0200 Subject: mptcp: pm: Defer freeing of MPTCP userspace path manager entries When path manager entries are deleted from the local address list, they are first unlinked from the address list using list_del_rcu(). The entries must not be freed until after the RCU grace period, but the existing code immediately frees the entry. Use kfree_rcu_mightsleep() and adjust sk_omem_alloc in open code instead of using the sock_kfree_s() helper. This code path is only called in a netlink handler, so the "might sleep" function is preferable to adding a rarely-used rcu_head member to struct mptcp_pm_addr_entry. Fixes: 88d097316371 ("mptcp: drop free_list for deleting entries") Cc: stable@vger.kernel.org Signed-off-by: Mat Martineau Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250421-net-mptcp-pm-defer-freeing-v1-1-e731dc6e86b9@kernel.org Signed-off-by: Jakub Kicinski --- net/mptcp/pm_userspace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 2cb62f026b1f..a715dcbe0146 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -337,7 +337,11 @@ int mptcp_pm_nl_remove_doit(struct sk_buff *skb, struct genl_info *info) release_sock(sk); - sock_kfree_s(sk, match, sizeof(*match)); + kfree_rcu_mightsleep(match); + /* Adjust sk_omem_alloc like sock_kfree_s() does, to match + * with allocation of this memory by sock_kmemdup() + */ + atomic_sub(sizeof(*match), &sk->sk_omem_alloc); err = 0; out: -- cgit v1.2.3 From ce72fea219c13c6485503928181c547d0e26756b Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Mon, 21 Apr 2025 19:07:14 +0200 Subject: selftests: mptcp: diag: use mptcp_lib_get_info_value When running diag.sh in a loop, chk_dump_one will report the following "grep: write error": 13 ....chk 2 cestab [ OK ] grep: write error 14 ....chk dump_one [ OK ] 15 ....chk 2->0 msk in use after flush [ OK ] 16 ....chk 2->0 cestab after flush [ OK ] This error is caused by a broken pipe. When the output of 'ss' is processed by grep, 'head -n 1' will exit immediately after getting the first line, causing the subsequent pipe to close. At this time, if 'grep' is still trying to write data to the closed pipe, it will trigger a SIGPIPE signal, causing a write error. One solution is not to use this problematic "head -n 1" command, but to use mptcp_lib_get_info_value() helper defined in mptcp_lib.sh to get the value of 'token'. 
Fixes: ba2400166570 ("selftests: mptcp: add a test for mptcp_diag_dump_one") Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Tested-by: Gang Yan Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20250421-net-mptcp-pm-defer-freeing-v1-2-e731dc6e86b9@kernel.org Signed-off-by: Jakub Kicinski --- tools/testing/selftests/net/mptcp/diag.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/net/mptcp/diag.sh b/tools/testing/selftests/net/mptcp/diag.sh index 4f55477ffe08..e7a75341f0f3 100755 --- a/tools/testing/selftests/net/mptcp/diag.sh +++ b/tools/testing/selftests/net/mptcp/diag.sh @@ -206,9 +206,8 @@ chk_dump_one() local token local msg - ss_token="$(ss -inmHMN $ns | grep 'token:' |\ - head -n 1 |\ - sed 's/.*token:\([0-9a-f]*\).*/\1/')" + ss_token="$(ss -inmHMN $ns | + mptcp_lib_get_info_value "token" "token")" token="$(ip netns exec $ns ./mptcp_diag -t $ss_token |\ awk -F':[ \t]+' '/^token/ {print $2}')" -- cgit v1.2.3 From 3df275ef0a6ae181e8428a6589ef5d5231e58b5c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Apr 2025 11:47:30 -0700 Subject: net_sched: hfsc: Fix a UAF vulnerability in class handling This patch fixes a Use-After-Free vulnerability in the HFSC qdisc class handling. The issue occurs due to a time-of-check/time-of-use condition in hfsc_change_class() when working with certain child qdiscs like netem or codel. The vulnerability works as follows: 1. hfsc_change_class() checks if a class has packets (q.qlen != 0) 2. It then calls qdisc_peek_len(), which for certain qdiscs (e.g., codel, netem) might drop packets and empty the queue 3. The code continues assuming the queue is still non-empty, adding the class to vttree 4. This breaks HFSC scheduler assumptions that only non-empty classes are in vttree 5. Later, when the class is destroyed, this can lead to a Use-After-Free The fix adds a second queue length check after qdisc_peek_len() to verify the queue wasn't emptied. Fixes: 21f4d5cc25ec ("net_sched/hfsc: fix curve activation in hfsc_change_class()") Reported-by: Gerrard Tai Reviewed-by: Konstantin Khlebnikov Signed-off-by: Cong Wang Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250417184732.943057-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_hfsc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ce5045eea065..b368ac0595d5 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -961,6 +961,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl != NULL) { int old_flags; + int len = 0; if (parentid) { if (cl->cl_parent && @@ -991,9 +992,13 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (usc != NULL) hfsc_change_usc(cl, usc, cur_time); + if (cl->qdisc->q.qlen != 0) + len = qdisc_peek_len(cl->qdisc); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ if (cl->qdisc->q.qlen != 0) { - int len = qdisc_peek_len(cl->qdisc); - if (cl->cl_flags & HFSC_RSC) { if (old_flags & HFSC_RSC) update_ed(cl, len); -- cgit v1.2.3 From 6ccbda44e2cc3d26fd22af54c650d6d5d801addf Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Apr 2025 11:47:31 -0700 Subject: net_sched: hfsc: Fix a potential UAF in hfsc_dequeue() too Similarly to the previous patch, we need to safe guard hfsc_dequeue() too. But for this one, we don't have a reliable reproducer. 
Fixes: 1da177e4c3f41524e886b7f1b8a0c1fc7321cac2 ("Linux-2.6.12-rc2") Reported-by: Gerrard Tai Signed-off-by: Cong Wang Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250417184732.943057-3-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- net/sched/sch_hfsc.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index b368ac0595d5..6c8ef826cec0 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1641,10 +1641,16 @@ hfsc_dequeue(struct Qdisc *sch) if (cl->qdisc->q.qlen != 0) { /* update ed */ next_len = qdisc_peek_len(cl->qdisc); - if (realtime) - update_ed(cl, next_len); - else - update_d(cl, next_len); + /* Check queue length again since some qdisc implementations + * (e.g., netem/codel) might empty the queue during the peek + * operation. + */ + if (cl->qdisc->q.qlen != 0) { + if (realtime) + update_ed(cl, next_len); + else + update_d(cl, next_len); + } } else { /* the class becomes passive */ eltree_remove(cl); -- cgit v1.2.3 From 7629d1a04ad2e76709401b655263040486972c2c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 17 Apr 2025 11:47:32 -0700 Subject: selftests/tc-testing: Add test for HFSC queue emptying during peek operation Add a selftest to exercise the condition where qdisc implementations like netem or codel might empty the queue during a peek operation. This tests the defensive code path in HFSC that checks the queue length again after peeking to handle this case. Based on the reproducer from Gerrard, improved by Jamal. Reported-by: Gerrard Tai Signed-off-by: Cong Wang Tested-by: Victor Nogueira Reviewed-by: Jamal Hadi Salim Link: https://patch.msgid.link/20250417184732.943057-4-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski --- .../tc-testing/tc-tests/infra/qdiscs.json | 39 ++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json index d4ea9cd845a3..e26bbc169783 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json +++ b/tools/testing/selftests/tc-testing/tc-tests/infra/qdiscs.json @@ -313,5 +313,44 @@ "$TC qdisc del dev $DUMMY handle 1: root", "$IP addr del 10.10.10.10/24 dev $DUMMY || true" ] + }, + { + "id": "a4c3", + "name": "Test HFSC with netem/blackhole - queue emptying during peek operation", + "category": [ + "qdisc", + "hfsc", + "netem", + "blackhole" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "$IP link set dev $DUMMY up || true", + "$IP addr add 10.10.10.10/24 dev $DUMMY || true", + "$TC qdisc add dev $DUMMY handle 1:0 root drr", + "$TC class add dev $DUMMY parent 1:0 classid 1:1 drr", + "$TC class add dev $DUMMY parent 1:0 classid 1:2 drr", + "$TC qdisc add dev $DUMMY parent 1:1 handle 2:0 plug limit 1024", + "$TC qdisc add dev $DUMMY parent 1:2 handle 3:0 hfsc default 1", + "$TC class add dev $DUMMY parent 3:0 classid 3:1 hfsc rt m1 5Mbit d 10ms m2 10Mbit", + "$TC qdisc add dev $DUMMY parent 3:1 handle 4:0 netem delay 1ms", + "$TC qdisc add dev $DUMMY parent 4:1 handle 5:0 blackhole", + "ping -c 3 -W 0.01 -i 0.001 -s 1 10.10.10.10 -I $DUMMY > /dev/null 2>&1 || true", + "$TC class change dev $DUMMY parent 3:0 classid 3:1 hfsc sc m1 5Mbit d 10ms m2 10Mbit", + "$TC class del dev $DUMMY parent 3:0 classid 3:1", + "$TC class add dev $DUMMY parent 3:0 classid 3:1 hfsc rt m1 5Mbit d 10ms m2 10Mbit", + "ping -c 3 -W 0.01 -i 0.001 -s 1 10.10.10.10 -I $DUMMY > /dev/null 2>&1 || 
true" + ], + "cmdUnderTest": "$TC class change dev $DUMMY parent 3:0 classid 3:1 hfsc sc m1 5Mbit d 10ms m2 10Mbit", + "expExitCode": "0", + "verifyCmd": "$TC -s qdisc show dev $DUMMY", + "matchPattern": "qdisc hfsc 3:.*parent 1:2.*default 1", + "matchCount": "1", + "teardown": [ + "$TC qdisc del dev $DUMMY handle 1:0 root", + "$IP addr del 10.10.10.10/24 dev $DUMMY || true" + ] } ] -- cgit v1.2.3 From 497041d763016c2e8314d2f6a329a9b77c3797ca Mon Sep 17 00:00:00 2001 From: Daniel Golle Date: Tue, 22 Apr 2025 04:10:20 +0100 Subject: net: dsa: mt7530: sync driver-specific behavior of MT7531 variants MT7531 standalone and MMIO variants found in MT7988 and EN7581 share most basic properties. Despite that, assisted_learning_on_cpu_port and mtu_enforcement_ingress were only applied for MT7531 but not for MT7988 or EN7581, causing the expected issues on MMIO devices. Apply both settings equally also for MT7988 and EN7581 by moving both assignments form mt7531_setup() to mt7531_setup_common(). This fixes unwanted flooding of packets due to unknown unicast during DA lookup, as well as issues with heterogenous MTU settings. Fixes: 7f54cc9772ce ("net: dsa: mt7530: split-off common parts from mt7531_setup") Signed-off-by: Daniel Golle Reviewed-by: Chester A. Unal Link: https://patch.msgid.link/89ed7ec6d4fa0395ac53ad2809742bb1ce61ed12.1745290867.git.daniel@makrotopia.org Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mt7530.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c index d70399bce5b9..c5d6628d7980 100644 --- a/drivers/net/dsa/mt7530.c +++ b/drivers/net/dsa/mt7530.c @@ -2419,6 +2419,9 @@ mt7531_setup_common(struct dsa_switch *ds) struct mt7530_priv *priv = ds->priv; int ret, i; + ds->assisted_learning_on_cpu_port = true; + ds->mtu_enforcement_ingress = true; + mt753x_trap_frames(priv); /* Enable and reset MIB counters */ @@ -2571,9 +2574,6 @@ mt7531_setup(struct dsa_switch *ds) if (ret) return ret; - ds->assisted_learning_on_cpu_port = true; - ds->mtu_enforcement_ingress = true; - return 0; } -- cgit v1.2.3 From d9e2f070d8af60f2c8c02b2ddf0a9e90b4e9220c Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 21 Apr 2025 10:46:03 -0700 Subject: pds_core: Prevent possible adminq overflow/stuck condition The pds_core's adminq is protected by the adminq_lock, which prevents more than 1 command to be posted onto it at any one time. This makes it so the client drivers cannot simultaneously post adminq commands. However, the completions happen in a different context, which means multiple adminq commands can be posted sequentially and all waiting on completion. On the FW side, the backing adminq request queue is only 16 entries long and the retry mechanism and/or overflow/stuck prevention is lacking. This can cause the adminq to get stuck, so commands are no longer processed and completions are no longer sent by the FW. As an initial fix, prevent more than 16 outstanding adminq commands so there's no way to cause the adminq from getting stuck. This works because the backing adminq request queue will never have more than 16 pending adminq commands, so it will never overflow. This is done by reducing the adminq depth to 16. 
Fixes: 45d76f492938 ("pds_core: set up device and adminq") Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250421174606.3892-2-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/core.c | 5 +---- drivers/net/ethernet/amd/pds_core/core.h | 2 +- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 1eb0d92786f7..55163457f12b 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -325,10 +325,7 @@ static int pdsc_core_init(struct pdsc *pdsc) size_t sz; int err; - /* Scale the descriptor ring length based on number of CPUs and VFs */ - numdescs = max_t(int, PDSC_ADMINQ_MIN_LENGTH, num_online_cpus()); - numdescs += 2 * pci_sriov_get_totalvfs(pdsc->pdev); - numdescs = roundup_pow_of_two(numdescs); + numdescs = PDSC_ADMINQ_MAX_LENGTH; err = pdsc_qcq_alloc(pdsc, PDS_CORE_QTYPE_ADMINQ, 0, "adminq", PDS_CORE_QCQ_F_CORE | PDS_CORE_QCQ_F_INTR, numdescs, diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index 0bf320c43083..199473112c29 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -16,7 +16,7 @@ #define PDSC_WATCHDOG_SECS 5 #define PDSC_QUEUE_NAME_MAX_SZ 16 -#define PDSC_ADMINQ_MIN_LENGTH 16 /* must be a power of two */ +#define PDSC_ADMINQ_MAX_LENGTH 16 /* must be a power of two */ #define PDSC_NOTIFYQ_LENGTH 64 /* must be a power of two */ #define PDSC_TEARDOWN_RECOVERY false #define PDSC_TEARDOWN_REMOVING true -- cgit v1.2.3 From 2567daad69cd1107fc0ec29b1615f110d7cf7385 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 21 Apr 2025 10:46:04 -0700 Subject: pds_core: handle unsupported PDS_CORE_CMD_FW_CONTROL result If the FW doesn't support the PDS_CORE_CMD_FW_CONTROL command the driver might at the least print garbage and at the worst crash when the user runs the "devlink dev info" devlink command. This happens because the stack variable fw_list is not 0 initialized which results in fw_list.num_fw_slots being a garbage value from the stack. Then the driver tries to access fw_list.fw_names[i] with i >= ARRAY_SIZE and runs off the end of the array. Fix this by initializing the fw_list and by not failing completely if the devcmd fails because other useful information is printed via devlink dev info even if the devcmd fails. 
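The same defensive pattern in a generic, standalone form (the structure, sizes and slot names here are placeholders, not the pds_core definitions): zero-initialize the stack buffer and clamp the element count before looping, so a failed or skipped read can never walk off the end of the array.

#include <stdio.h>
#include <string.h>

#define MAX_SLOTS 4

struct fw_list {
	unsigned char num_slots;
	char names[MAX_SLOTS][16];
};

static void print_fw_list(const struct fw_list *from_device, int read_failed)
{
	struct fw_list list = {};	/* never loop over stack garbage */
	unsigned int i, n;

	if (!read_failed)
		memcpy(&list, from_device, sizeof(list));

	n = list.num_slots < MAX_SLOTS ? list.num_slots : MAX_SLOTS;
	for (i = 0; i < n; i++)		/* clamped, like min(..., ARRAY_SIZE(...)) */
		printf("fw.slot_%u: %s\n", i, list.names[i]);
}

int main(void)
{
	struct fw_list dev = { .num_slots = 2,
			       .names = { "bank0-fw", "bank1-fw" } };

	print_fw_list(&dev, 0);		/* normal case: two slots printed */
	print_fw_list(&dev, 1);		/* devcmd failed: nothing printed, no OOB */
	return 0;
}
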
Fixes: 45d76f492938 ("pds_core: set up device and adminq") Signed-off-by: Brett Creeley Reviewed-by: Simon Horman Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250421174606.3892-3-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/devlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/devlink.c b/drivers/net/ethernet/amd/pds_core/devlink.c index c5c787df61a4..d8dc39da4161 100644 --- a/drivers/net/ethernet/amd/pds_core/devlink.c +++ b/drivers/net/ethernet/amd/pds_core/devlink.c @@ -105,7 +105,7 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, .fw_control.opcode = PDS_CORE_CMD_FW_CONTROL, .fw_control.oper = PDS_CORE_FW_GET_LIST, }; - struct pds_core_fw_list_info fw_list; + struct pds_core_fw_list_info fw_list = {}; struct pdsc *pdsc = devlink_priv(dl); union pds_core_dev_comp comp; char buf[32]; @@ -118,8 +118,6 @@ int pdsc_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (!err) memcpy_fromio(&fw_list, pdsc->cmd_regs->data, sizeof(fw_list)); mutex_unlock(&pdsc->devcmd_lock); - if (err && err != -EIO) - return err; listlen = min(fw_list.num_fw_slots, ARRAY_SIZE(fw_list.fw_names)); for (i = 0; i < listlen; i++) { -- cgit v1.2.3 From f9559d818205a4a0b9cd87181ef46e101ea11157 Mon Sep 17 00:00:00 2001 From: Brett Creeley Date: Mon, 21 Apr 2025 10:46:05 -0700 Subject: pds_core: Remove unnecessary check in pds_client_adminq_cmd() When the pds_core driver was first created there were some race conditions around using the adminq, especially for client drivers. To reduce the possibility of a race condition there's a check against pf->state in pds_client_adminq_cmd(). This is problematic for a couple of reasons: 1. The PDSC_S_INITING_DRIVER bit is set during probe, but not cleared until after everything in probe is complete, which includes creating the auxiliary devices. For pds_fwctl this means it can't make any adminq commands until after pds_core's probe is complete even though the adminq is fully up by the time pds_fwctl's auxiliary device is created. 2. The race conditions around using the adminq have been fixed and this path is already protected against client drivers calling pds_client_adminq_cmd() if the adminq isn't ready, i.e. see pdsc_adminq_post() -> pdsc_adminq_inc_if_up(). Fix this by removing the pf->state check in pds_client_adminq_cmd() because invalid accesses to pds_core's adminq is already handled by pdsc_adminq_post()->pdsc_adminq_inc_if_up(). 
Fixes: 10659034c622 ("pds_core: add the aux client API") Reviewed-by: Simon Horman Signed-off-by: Brett Creeley Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250421174606.3892-4-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/auxbus.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/auxbus.c b/drivers/net/ethernet/amd/pds_core/auxbus.c index eeb72b1809ea..c9aac27883a3 100644 --- a/drivers/net/ethernet/amd/pds_core/auxbus.c +++ b/drivers/net/ethernet/amd/pds_core/auxbus.c @@ -107,9 +107,6 @@ int pds_client_adminq_cmd(struct pds_auxiliary_dev *padev, dev_dbg(pf->dev, "%s: %s opcode %d\n", __func__, dev_name(&padev->aux_dev.dev), req->opcode); - if (pf->state) - return -ENXIO; - /* Wrap the client's request */ cmd.client_request.opcode = PDS_AQ_CMD_CLIENT_CMD; cmd.client_request.client_id = cpu_to_le16(padev->client_id); -- cgit v1.2.3 From 3f77c3dfffc7063428b100c4945ca2a7a8680380 Mon Sep 17 00:00:00 2001 From: Shannon Nelson Date: Mon, 21 Apr 2025 10:46:06 -0700 Subject: pds_core: make wait_context part of q_info Make the wait_context a full part of the q_info struct rather than a stack variable that goes away after pdsc_adminq_post() is done so that the context is still available after the wait loop has given up. There was a case where a slow development firmware caused the adminq request to time out, but then later the FW finally finished the request and sent the interrupt. The handler tried to complete_all() the completion context that had been created on the stack in pdsc_adminq_post() but no longer existed. This caused bad pointer usage, kernel crashes, and much wailing and gnashing of teeth. Fixes: 01ba61b55b20 ("pds_core: Add adminq processing and commands") Reviewed-by: Simon Horman Signed-off-by: Shannon Nelson Reviewed-by: Jacob Keller Link: https://patch.msgid.link/20250421174606.3892-5-shannon.nelson@amd.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/amd/pds_core/adminq.c | 36 ++++++++++++------------------ drivers/net/ethernet/amd/pds_core/core.c | 4 +++- drivers/net/ethernet/amd/pds_core/core.h | 2 +- 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/drivers/net/ethernet/amd/pds_core/adminq.c b/drivers/net/ethernet/amd/pds_core/adminq.c index c83a0a80d533..506f682d15c1 100644 --- a/drivers/net/ethernet/amd/pds_core/adminq.c +++ b/drivers/net/ethernet/amd/pds_core/adminq.c @@ -5,11 +5,6 @@ #include "core.h" -struct pdsc_wait_context { - struct pdsc_qcq *qcq; - struct completion wait_completion; -}; - static int pdsc_process_notifyq(struct pdsc_qcq *qcq) { union pds_core_notifyq_comp *comp; @@ -109,10 +104,10 @@ void pdsc_process_adminq(struct pdsc_qcq *qcq) q_info = &q->info[q->tail_idx]; q->tail_idx = (q->tail_idx + 1) & (q->num_descs - 1); - /* Copy out the completion data */ - memcpy(q_info->dest, comp, sizeof(*comp)); - - complete_all(&q_info->wc->wait_completion); + if (!completion_done(&q_info->completion)) { + memcpy(q_info->dest, comp, sizeof(*comp)); + complete(&q_info->completion); + } if (cq->tail_idx == cq->num_descs - 1) cq->done_color = !cq->done_color; @@ -162,8 +157,7 @@ irqreturn_t pdsc_adminq_isr(int irq, void *data) static int __pdsc_adminq_post(struct pdsc *pdsc, struct pdsc_qcq *qcq, union pds_core_adminq_cmd *cmd, - union pds_core_adminq_comp *comp, - struct pdsc_wait_context *wc) + union pds_core_adminq_comp *comp) { struct pdsc_queue *q = &qcq->q; struct pdsc_q_info *q_info; @@ -205,9 +199,9 @@ static int 
__pdsc_adminq_post(struct pdsc *pdsc, /* Post the request */ index = q->head_idx; q_info = &q->info[index]; - q_info->wc = wc; q_info->dest = comp; memcpy(q_info->desc, cmd, sizeof(*cmd)); + reinit_completion(&q_info->completion); dev_dbg(pdsc->dev, "head_idx %d tail_idx %d\n", q->head_idx, q->tail_idx); @@ -231,16 +225,13 @@ int pdsc_adminq_post(struct pdsc *pdsc, union pds_core_adminq_comp *comp, bool fast_poll) { - struct pdsc_wait_context wc = { - .wait_completion = - COMPLETION_INITIALIZER_ONSTACK(wc.wait_completion), - }; unsigned long poll_interval = 1; unsigned long poll_jiffies; unsigned long time_limit; unsigned long time_start; unsigned long time_done; unsigned long remaining; + struct completion *wc; int err = 0; int index; @@ -250,20 +241,19 @@ int pdsc_adminq_post(struct pdsc *pdsc, return -ENXIO; } - wc.qcq = &pdsc->adminqcq; - index = __pdsc_adminq_post(pdsc, &pdsc->adminqcq, cmd, comp, &wc); + index = __pdsc_adminq_post(pdsc, &pdsc->adminqcq, cmd, comp); if (index < 0) { err = index; goto err_out; } + wc = &pdsc->adminqcq.q.info[index].completion; time_start = jiffies; time_limit = time_start + HZ * pdsc->devcmd_timeout; do { /* Timeslice the actual wait to catch IO errors etc early */ poll_jiffies = msecs_to_jiffies(poll_interval); - remaining = wait_for_completion_timeout(&wc.wait_completion, - poll_jiffies); + remaining = wait_for_completion_timeout(wc, poll_jiffies); if (remaining) break; @@ -292,9 +282,11 @@ int pdsc_adminq_post(struct pdsc *pdsc, dev_dbg(pdsc->dev, "%s: elapsed %d msecs\n", __func__, jiffies_to_msecs(time_done - time_start)); - /* Check the results */ - if (time_after_eq(time_done, time_limit)) + /* Check the results and clear an un-completed timeout */ + if (time_after_eq(time_done, time_limit) && !completion_done(wc)) { err = -ETIMEDOUT; + complete(wc); + } dev_dbg(pdsc->dev, "read admin queue completion idx %d:\n", index); dynamic_hex_dump("comp ", DUMP_PREFIX_OFFSET, 16, 1, diff --git a/drivers/net/ethernet/amd/pds_core/core.c b/drivers/net/ethernet/amd/pds_core/core.c index 55163457f12b..9512aa4083f0 100644 --- a/drivers/net/ethernet/amd/pds_core/core.c +++ b/drivers/net/ethernet/amd/pds_core/core.c @@ -167,8 +167,10 @@ static void pdsc_q_map(struct pdsc_queue *q, void *base, dma_addr_t base_pa) q->base = base; q->base_pa = base_pa; - for (i = 0, cur = q->info; i < q->num_descs; i++, cur++) + for (i = 0, cur = q->info; i < q->num_descs; i++, cur++) { cur->desc = base + (i * q->desc_size); + init_completion(&cur->completion); + } } static void pdsc_cq_map(struct pdsc_cq *cq, void *base, dma_addr_t base_pa) diff --git a/drivers/net/ethernet/amd/pds_core/core.h b/drivers/net/ethernet/amd/pds_core/core.h index 199473112c29..0b53a1fab46d 100644 --- a/drivers/net/ethernet/amd/pds_core/core.h +++ b/drivers/net/ethernet/amd/pds_core/core.h @@ -96,7 +96,7 @@ struct pdsc_q_info { unsigned int bytes; unsigned int nbufs; struct pdsc_buf_info bufs[PDS_CORE_MAX_FRAGS]; - struct pdsc_wait_context *wc; + struct completion completion; void *dest; }; -- cgit v1.2.3 From 607b310ada5ef4c738f9dffc758a62a9d309b084 Mon Sep 17 00:00:00 2001 From: Johannes Schneider Date: Wed, 23 Apr 2025 06:47:24 +0200 Subject: net: dp83822: Fix OF_MDIO config check When CONFIG_OF_MDIO is set to be a module the code block is not compiled. Use the IS_ENABLED macro that checks for both built in as well as module. 
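A standalone way to see the difference (this simulates the kconfig behaviour in plain C; in a real build the defines come from the generated autoconf.h): with CONFIG_OF_MDIO=m the preprocessor only sees CONFIG_OF_MDIO_MODULE, so a bare #ifdef compiles the block out, while IS_ENABLED() — which effectively tests for both the =y and =m symbols — keeps it.

#include <stdio.h>

/* Simulate CONFIG_OF_MDIO=m: kbuild defines only the _MODULE variant. */
#define CONFIG_OF_MDIO_MODULE 1

int main(void)
{
#ifdef CONFIG_OF_MDIO
	puts("#ifdef CONFIG_OF_MDIO: fiber support compiled in");
#else
	puts("#ifdef CONFIG_OF_MDIO: fiber support compiled OUT (the bug)");
#endif

#if defined(CONFIG_OF_MDIO) || defined(CONFIG_OF_MDIO_MODULE)
	puts("IS_ENABLED()-style test: fiber support compiled in");
#endif
	return 0;
}
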
Fixes: 5dc39fd5ef35 ("net: phy: DP83822: Add ability to advertise Fiber connection") Signed-off-by: Johannes Schneider Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20250423044724.1284492-1-johannes.schneider@leica-geosystems.com Signed-off-by: Paolo Abeni --- drivers/net/phy/dp83822.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/dp83822.c b/drivers/net/phy/dp83822.c index 14f361549638..e32013eb0186 100644 --- a/drivers/net/phy/dp83822.c +++ b/drivers/net/phy/dp83822.c @@ -730,7 +730,7 @@ static int dp83822_phy_reset(struct phy_device *phydev) return phydev->drv->config_init(phydev); } -#ifdef CONFIG_OF_MDIO +#if IS_ENABLED(CONFIG_OF_MDIO) static const u32 tx_amplitude_100base_tx_gain[] = { 80, 82, 83, 85, 87, 88, 90, 92, 93, 95, 97, 98, 100, 102, 103, 105, -- cgit v1.2.3 From 73fa4597bdc035437fbcd84d6be32bd39f1f2149 Mon Sep 17 00:00:00 2001 From: Alexis Lothore Date: Wed, 23 Apr 2025 09:12:09 +0200 Subject: net: stmmac: fix dwmac1000 ptp timestamp status offset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a PTP interrupt occurs, the driver accesses the wrong offset to learn about the number of available snapshots in the FIFO for dwmac1000: it should be accessing bits 29..25, while it is currently reading bits 19..16 (those are bits about the auxiliary triggers which have generated the timestamps). As a consequence, it does not compute correctly the number of available snapshots, and so possibly do not generate the corresponding clock events if the bogus value ends up being 0. Fix clock events generation by reading the correct bits in the timestamp register for dwmac1000. Fixes: 477c3e1f6363 ("net: stmmac: Introduce dwmac1000 timestamping operations") Signed-off-by: Alexis Lothoré Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20250423-stmmac_ts-v2-1-e2cf2bbd61b1@bootlin.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac1000.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h index 967a16212faf..0c011a47d5a3 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h @@ -320,8 +320,8 @@ enum rtc_control { /* PTP and timestamping registers */ -#define GMAC3_X_ATSNS GENMASK(19, 16) -#define GMAC3_X_ATSNS_SHIFT 16 +#define GMAC3_X_ATSNS GENMASK(29, 25) +#define GMAC3_X_ATSNS_SHIFT 25 #define GMAC_PTP_TCR_ATSFC BIT(24) #define GMAC_PTP_TCR_ATSEN0 BIT(25) -- cgit v1.2.3 From 7b7491372f8ec2d8c08da18e5d629e55f41dda89 Mon Sep 17 00:00:00 2001 From: Alexis Lothoré Date: Wed, 23 Apr 2025 09:12:10 +0200 Subject: net: stmmac: fix multiplication overflow when reading timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current way of reading a timestamp snapshot in stmmac can lead to integer overflow, as the computation is done on 32 bits. The issue has been observed on a dwmac-socfpga platform returning chaotic timestamp values due to this overflow. The corresponding multiplication is done with a MUL instruction, which returns 32 bit values. Explicitly casting the value to 64 bits replaced the MUL with a UMLAL, which computes and returns the result on 64 bits, and so returns correctly the timestamps. Prevent this overflow by explicitly casting the intermediate value to u64 to make sure that the whole computation is made on u64. 
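A minimal, self-contained illustration of the overflow (the seconds value is an arbitrary example, not a real register read, and the usual 32-bit int ABI is assumed): when both operands are 32 bits wide the multiply wraps before it is widened, which is the corruption seen on the socfpga platform; casting one operand to u64 first makes the whole computation 64-bit.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000U

int main(void)
{
	uint32_t sec = 5;	/* e.g. seconds part of a timestamp snapshot */
	uint64_t wrong = sec * NSEC_PER_SEC;		/* 32-bit multiply, wraps */
	uint64_t right = (uint64_t)sec * NSEC_PER_SEC;	/* 64-bit multiply */

	printf("wrong: %llu\nright: %llu\n",
	       (unsigned long long)wrong, (unsigned long long)right);
	/* prints: wrong: 705032704, right: 5000000000 */
	return 0;
}
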
While at it, apply the same cast on the other dwmac variant (GMAC4) method for snapshot retrieval. Fixes: 477c3e1f6363 ("net: stmmac: Introduce dwmac1000 timestamping operations") Signed-off-by: Alexis Lothoré Reviewed-by: Maxime Chevallier Link: https://patch.msgid.link/20250423-stmmac_ts-v2-2-e2cf2bbd61b1@bootlin.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2 +- drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c index a8b901cdf5cb..56b76aaa58f0 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c @@ -553,7 +553,7 @@ void dwmac1000_get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) u64 ns; ns = readl(ptpaddr + GMAC_PTP_ATNR); - ns += readl(ptpaddr + GMAC_PTP_ATSR) * NSEC_PER_SEC; + ns += (u64)readl(ptpaddr + GMAC_PTP_ATSR) * NSEC_PER_SEC; *ptp_time = ns; } diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c index 0f59aa982604..e2840fa241f2 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_hwtstamp.c @@ -222,7 +222,7 @@ static void get_ptptime(void __iomem *ptpaddr, u64 *ptp_time) u64 ns; ns = readl(ptpaddr + PTP_ATNR); - ns += readl(ptpaddr + PTP_ATSR) * NSEC_PER_SEC; + ns += (u64)readl(ptpaddr + PTP_ATSR) * NSEC_PER_SEC; *ptp_time = ns; } -- cgit v1.2.3 From 5e16f1a68d28965c12b6fa227a306fef8a680f84 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Thu, 24 Apr 2025 12:28:39 +0100 Subject: io_uring: don't duplicate flushing in io_req_post_cqe io_req_post_cqe() sets submit_state.cq_flush so that *flush_completions() can take care of batch commiting CQEs. Don't commit it twice by using __io_cq_unlock_post(). Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/41c416660c509cee676b6cad96081274bcb459f3.1745493861.git.asml.silence@gmail.com Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index c6209fe44cb1..eedb47c8c79e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -872,10 +872,15 @@ bool io_req_post_cqe(struct io_kiocb *req, s32 res, u32 cflags) lockdep_assert(!io_wq_current_is_worker()); lockdep_assert_held(&ctx->uring_lock); - __io_cq_lock(ctx); - posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + if (!ctx->lockless_cq) { + spin_lock(&ctx->completion_lock); + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + spin_unlock(&ctx->completion_lock); + } else { + posted = io_fill_cqe_aux(ctx, req->cqe.user_data, res, cflags); + } + ctx->submit_state.cq_flush = true; - __io_cq_unlock_post(ctx); return posted; } -- cgit v1.2.3 From 1d019736b6f812bebf3ef89d6e887d06e2a822fc Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Wed, 23 Apr 2025 15:29:03 -0600 Subject: selftests: ublk: common: fix _get_disk_dev_t for pre-9.0 coreutils Some distributions, such as centos stream 9, still have a version of coreutils which does not yet support the %Hr and %Lr formats for stat(1) [1, 2]. 
Running ublk selftests on these distributions results in the following error in tests that use the _get_disk_dev_t helper: line 23: ?r: syntax error: operand expected (error token is "?r") To better accommodate older distributions, rewrite _get_disk_dev_t to use the much older %t and %T formats for stat instead. [1] https://github.com/coreutils/coreutils/blob/v9.0/NEWS#L114 [2] https://pkgs.org/download/coreutils Signed-off-by: Uday Shankar Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20250423-ublk_selftests-v1-2-7d060e260e76@purestorage.com Signed-off-by: Jens Axboe --- tools/testing/selftests/ublk/test_common.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/ublk/test_common.sh b/tools/testing/selftests/ublk/test_common.sh index 9fc111f64576..a81210ca3e99 100755 --- a/tools/testing/selftests/ublk/test_common.sh +++ b/tools/testing/selftests/ublk/test_common.sh @@ -17,8 +17,8 @@ _get_disk_dev_t() { local minor dev=/dev/ublkb"${dev_id}" - major=$(stat -c '%Hr' "$dev") - minor=$(stat -c '%Lr' "$dev") + major="0x"$(stat -c '%t' "$dev") + minor="0x"$(stat -c '%T' "$dev") echo $(( (major & 0xfff) << 20 | (minor & 0xfffff) )) } -- cgit v1.2.3 From 7b720c720253e2070459420b2628a7b9ee6733b3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 24 Apr 2025 10:25:21 +0200 Subject: block: never reduce ra_pages in blk_apply_bdi_limits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the user increased the read-ahead size through sysfs this value currently get lost if the device is reprobe, including on a resume from suspend. As there is no hardware limitation for the read-ahead size there is no real need to reset it or track a separate hardware limitation like for max_sectors. This restores the pre-atomic queue limit behavior in the sd driver as sd did not use blk_queue_io_opt and thus never updated the read ahead size to the value based of the optimal I/O, but changes behavior for all other drivers. As the new behavior seems useful and sd is the driver for which the readahead size tweaks are most useful that seems like a worthwhile trade off. Fixes: 804e498e0496 ("sd: convert to the atomic queue limits API") Reported-by: Holger Hoffstätte Signed-off-by: Christoph Hellwig Tested-by: Holger Hoffstätte Reviewed-by: Hannes Reinecke Link: https://lore.kernel.org/r/20250424082521.1967286-1-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-settings.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 6b2dbe645d23..4817e7ca03f8 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -61,8 +61,14 @@ void blk_apply_bdi_limits(struct backing_dev_info *bdi, /* * For read-ahead of large files to be effective, we need to read ahead * at least twice the optimal I/O size. + * + * There is no hardware limitation for the read-ahead size and the user + * might have increased the read-ahead size through sysfs, so don't ever + * decrease it. */ - bdi->ra_pages = max(lim->io_opt * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + bdi->ra_pages = max3(bdi->ra_pages, + lim->io_opt * 2 / PAGE_SIZE, + VM_READAHEAD_PAGES); bdi->io_pages = lim->max_sectors >> PAGE_SECTORS_SHIFT; } -- cgit v1.2.3 From c63202140d4b411d27380805c4d68eb11407b7f2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 07:37:39 +0200 Subject: block: move blkdev_{get,put} _no_open prototypes out of blkdev.h These are only to be used by block internal code. 
Remove the comment as we grew more users due to reworking block device node opening. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250423053810.1683309-2-hch@lst.de Signed-off-by: Jens Axboe --- block/blk.h | 3 +++ include/linux/blkdev.h | 4 ---- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/block/blk.h b/block/blk.h index 006e3be433d2..939e5e1aec9c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -94,6 +94,9 @@ static inline void blk_wait_io(struct completion *done) wait_for_completion_io(done); } +struct block_device *blkdev_get_no_open(dev_t dev); +void blkdev_put_no_open(struct block_device *bdev); + #define BIO_INLINE_VECS 4 struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c6b478cfb32d..14abaeedaa88 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1671,10 +1671,6 @@ int bd_prepare_to_claim(struct block_device *bdev, void *holder, const struct blk_holder_ops *hops); void bd_abort_claiming(struct block_device *bdev, void *holder); -/* just for blk-cgroup, don't use elsewhere */ -struct block_device *blkdev_get_no_open(dev_t dev); -void blkdev_put_no_open(struct block_device *bdev); - struct block_device *I_BDEV(struct inode *inode); struct block_device *file_bdev(struct file *bdev_file); bool disk_live(struct gendisk *disk); -- cgit v1.2.3 From d13b7090b2510abaa83a25717466decca23e8226 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 07:37:40 +0200 Subject: block: remove the backing_inode variable in bdev_statx backing_inode is only used once, so remove it and update the comment describing the bdev lookup to be a bit more clear. Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250423053810.1683309-3-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index e29b3b6d1707..b57c2c0853fc 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -1313,18 +1313,15 @@ void sync_bdevs(bool wait) void bdev_statx(struct path *path, struct kstat *stat, u32 request_mask) { - struct inode *backing_inode; struct block_device *bdev; - backing_inode = d_backing_inode(path->dentry); - /* - * Note that backing_inode is the inode of a block device node file, - * not the block device's internal inode. Therefore it is *not* valid - * to use I_BDEV() here; the block device has to be looked up by i_rdev + * Note that d_backing_inode() returns the block device node inode, not + * the block device's internal inode. Therefore it is *not* valid to + * use I_BDEV() here; the block device has to be looked up by i_rdev * instead. */ - bdev = blkdev_get_no_open(backing_inode->i_rdev); + bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev); if (!bdev) return; -- cgit v1.2.3 From 5f33b5226c9d92359e58e91ad0bf0c1791da36a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 07:37:41 +0200 Subject: block: don't autoload drivers on stat blkdev_get_no_open can trigger the legacy autoload of block drivers. A simple stat of a block device has not historically done that, so disable this behavior again. 
Fixes: 9abcfbd235f5 ("block: Add atomic write support for statx") Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250423053810.1683309-4-hch@lst.de Signed-off-by: Jens Axboe --- block/bdev.c | 8 ++++---- block/blk-cgroup.c | 2 +- block/blk.h | 2 +- block/fops.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/block/bdev.c b/block/bdev.c index b57c2c0853fc..520515e4e64e 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -815,13 +815,13 @@ static void blkdev_put_part(struct block_device *part) blkdev_put_whole(whole); } -struct block_device *blkdev_get_no_open(dev_t dev) +struct block_device *blkdev_get_no_open(dev_t dev, bool autoload) { struct block_device *bdev; struct inode *inode; inode = ilookup(blockdev_superblock, dev); - if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { + if (!inode && autoload && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { blk_request_module(dev); inode = ilookup(blockdev_superblock, dev); if (inode) @@ -1043,7 +1043,7 @@ struct file *bdev_file_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (ret) return ERR_PTR(ret); - bdev = blkdev_get_no_open(dev); + bdev = blkdev_get_no_open(dev, true); if (!bdev) return ERR_PTR(-ENXIO); @@ -1321,7 +1321,7 @@ void bdev_statx(struct path *path, struct kstat *stat, * use I_BDEV() here; the block device has to be looked up by i_rdev * instead. */ - bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev); + bdev = blkdev_get_no_open(d_backing_inode(path->dentry)->i_rdev, false); if (!bdev) return; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 5905f277057b..e172aeda4183 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -797,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) return -EINVAL; input = skip_spaces(input); - bdev = blkdev_get_no_open(MKDEV(major, minor)); + bdev = blkdev_get_no_open(MKDEV(major, minor), true); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { diff --git a/block/blk.h b/block/blk.h index 939e5e1aec9c..328075787814 100644 --- a/block/blk.h +++ b/block/blk.h @@ -94,7 +94,7 @@ static inline void blk_wait_io(struct completion *done) wait_for_completion_io(done); } -struct block_device *blkdev_get_no_open(dev_t dev); +struct block_device *blkdev_get_no_open(dev_t dev, bool autoload); void blkdev_put_no_open(struct block_device *bdev); #define BIO_INLINE_VECS 4 diff --git a/block/fops.c b/block/fops.c index e221fdcaa8aa..82b672d15ea4 100644 --- a/block/fops.c +++ b/block/fops.c @@ -642,7 +642,7 @@ static int blkdev_open(struct inode *inode, struct file *filp) if (ret) return ret; - bdev = blkdev_get_no_open(inode->i_rdev); + bdev = blkdev_get_no_open(inode->i_rdev, true); if (!bdev) return -ENXIO; -- cgit v1.2.3 From c4d2519c6ad854dc2114e77d693b3cf1baf55330 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 07:37:42 +0200 Subject: block: don't autoload drivers on blk-cgroup configuration Loading a driver just to configure blk-cgroup doesn't make sense, as that assumes and already existing device. 
Signed-off-by: Christoph Hellwig Reviewed-by: Christian Brauner Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20250423053810.1683309-5-hch@lst.de Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index e172aeda4183..ce93706555c5 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -797,7 +797,7 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) return -EINVAL; input = skip_spaces(input); - bdev = blkdev_get_no_open(MKDEV(major, minor), true); + bdev = blkdev_get_no_open(MKDEV(major, minor), false); if (!bdev) return -ENODEV; if (bdev_is_partition(bdev)) { -- cgit v1.2.3 From 5f9e1698141a724ef63f75ee22fa9007d97de5bb Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Fri, 18 Apr 2025 13:03:08 -0400 Subject: KVM: arm64, x86: make kvm_arch_has_irq_bypass() inline kvm_arch_has_irq_bypass() is a small function and even though it does not appear in any *really* hot paths, it's also not entirely rare. Make it inline---it also works out nicely in preparation for using it in kvm-intel.ko and kvm-amd.ko, since the function is not currently exported. Suggested-by: Linus Torvalds Signed-off-by: Paolo Bonzini --- arch/arm64/include/asm/kvm_host.h | 5 +++++ arch/arm64/kvm/arm.c | 5 ----- arch/x86/include/asm/kvm_host.h | 6 ++++++ arch/x86/kvm/x86.c | 5 ----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index e98cfe7855a6..08ba91e6fb03 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -1588,4 +1588,9 @@ void kvm_set_vm_id_reg(struct kvm *kvm, u32 reg, u64 val); #define kvm_has_s1poe(k) \ (kvm_has_feat((k), ID_AA64MMFR3_EL1, S1POE, IMP)) +static inline bool kvm_arch_has_irq_bypass(void) +{ + return true; +} + #endif /* __ARM64_KVM_HOST_H__ */ diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 68fec8c95fee..19ca57def629 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -2743,11 +2743,6 @@ bool kvm_arch_irqchip_in_kernel(struct kvm *kvm) return irqchip_in_kernel(kvm); } -bool kvm_arch_has_irq_bypass(void) -{ - return true; -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 3bdae454a959..7bc174a1f1cb 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -2423,4 +2424,9 @@ int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages); */ #define KVM_EXIT_HYPERCALL_MBZ GENMASK_ULL(31, 1) +static inline bool kvm_arch_has_irq_bypass(void) +{ + return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); +} + #endif /* _ASM_X86_KVM_HOST_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 3712dde0bf9d..c1fdd527044c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13556,11 +13556,6 @@ bool kvm_arch_has_noncoherent_dma(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma); -bool kvm_arch_has_irq_bypass(void) -{ - return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP); -} - int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, struct irq_bypass_producer *prod) { -- cgit v1.2.3 From 6560aff981ada9aec8163509a59d8f48283263d6 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 1 Apr 2025 09:18:03 -0700 Subject: KVM: 
SVM: Don't update IRTEs if APICv/AVIC is disabled Skip IRTE updates if AVIC is disabled/unsupported, as forcing the IRTE into remapped mode (kvm_vcpu_apicv_active() will never be true) is unnecessary and wasteful. The IOMMU driver is responsible for putting IRTEs into remapped mode when an IRQ is allocated by a device, long before that device is assigned to a VM. I.e. the kernel as a whole has major issues if the IRTE isn't already in remapped mode. Opportunsitically kvm_arch_has_irq_bypass() to query for APICv/AVIC, so so that all checks in KVM x86 incorporate the same information. Cc: Yosry Ahmed Cc: Jim Mattson Signed-off-by: Sean Christopherson Message-ID: <20250401161804.842968-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 65fd245a9953..901d8d2dc169 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -898,8 +898,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_irq_routing_table *irq_rt; int idx, ret = 0; - if (!kvm_arch_has_assigned_device(kvm) || - !irq_remapping_cap(IRQ_POSTING_CAP)) + if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) return 0; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", -- cgit v1.2.3 From 7537deda36521fa8fff9133b39c46e31893606f2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:16 -0700 Subject: KVM: SVM: Allocate IR data using atomic allocation Allocate SVM's interrupt remapping metadata using GFP_ATOMIC as svm_ir_list_add() is called with IRQs are disabled and irqfs.lock held when kvm_irq_routing_update() reacts to GSI routing changes. Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-2-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 901d8d2dc169..a961e6e67050 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -820,7 +820,7 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) * Allocating new amd_iommu_pi_data, which will get * add to the per-vcpu ir_list. */ - ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); + ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_ATOMIC | __GFP_ACCOUNT); if (!ir) { ret = -ENOMEM; goto out; -- cgit v1.2.3 From 9bcac97dc42d2f4da8229d18feb0fe2b1ce523a2 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:17 -0700 Subject: KVM: x86: Reset IRTE to host control if *new* route isn't postable Restore an IRTE back to host control (remapped or posted MSI mode) if the *new* GSI route prevents posting the IRQ directly to a vCPU, regardless of the GSI routing type. Updating the IRTE if and only if the new GSI is an MSI results in KVM leaving an IRTE posting to a vCPU. The dangling IRTE can result in interrupts being incorrectly delivered to the guest, and in the worst case scenario can result in use-after-free, e.g. if the VM is torn down, but the underlying host IRQ isn't freed. 
Fixes: efc644048ecd ("KVM: x86: Update IRTE for posted-interrupts") Fixes: 411b44ba80ab ("svm: Implements update_pi_irte hook to setup posted interrupt") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-3-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 58 ++++++++++++++++++++++-------------------- arch/x86/kvm/vmx/posted_intr.c | 28 ++++++++------------ 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a961e6e67050..8e09f6ae98fd 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -896,6 +896,7 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !kvm_arch_has_irq_bypass()) @@ -932,6 +933,8 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_vcpu_apicv_active(&svm->vcpu)) { struct amd_iommu_pi_data pi; + enable_remapped_mode = false; + /* Try to enable guest_mode in IRTE */ pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & AVIC_HPA_MASK); @@ -950,33 +953,6 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, */ if (!ret && pi.is_guest_mode) svm_ir_list_add(svm, &pi); - } else { - /* Use legacy mode in IRTE */ - struct amd_iommu_pi_data pi; - - /** - * Here, pi is used to: - * - Tell IOMMU to use legacy mode for this interrupt. - * - Retrieve ga_tag of prior interrupt remapping data. - */ - pi.prev_ga_tag = 0; - pi.is_guest_mode = false; - ret = irq_set_vcpu_affinity(host_irq, &pi); - - /** - * Check if the posted interrupt was previously - * setup with the guest_mode by checking if the ga_tag - * was cached. If so, we need to clean up the per-vcpu - * ir_list. - */ - if (!ret && pi.prev_ga_tag) { - int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); - struct kvm_vcpu *vcpu; - - vcpu = kvm_get_vcpu_by_id(kvm, id); - if (vcpu) - svm_ir_list_del(to_svm(vcpu), &pi); - } } if (!ret && svm) { @@ -992,6 +968,34 @@ int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } ret = 0; + if (enable_remapped_mode) { + /* Use legacy mode in IRTE */ + struct amd_iommu_pi_data pi; + + /** + * Here, pi is used to: + * - Tell IOMMU to use legacy mode for this interrupt. + * - Retrieve ga_tag of prior interrupt remapping data. + */ + pi.prev_ga_tag = 0; + pi.is_guest_mode = false; + ret = irq_set_vcpu_affinity(host_irq, &pi); + + /** + * Check if the posted interrupt was previously + * setup with the guest_mode by checking if the ga_tag + * was cached. If so, we need to clean up the per-vcpu + * ir_list. 
+ */ + if (!ret && pi.prev_ga_tag) { + int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); + struct kvm_vcpu *vcpu; + + vcpu = kvm_get_vcpu_by_id(kvm, id); + if (vcpu) + svm_ir_list_del(to_svm(vcpu), &pi); + } + } out: srcu_read_unlock(&kvm->irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/vmx/posted_intr.c b/arch/x86/kvm/vmx/posted_intr.c index 51116fe69a50..d70e5b90087d 100644 --- a/arch/x86/kvm/vmx/posted_intr.c +++ b/arch/x86/kvm/vmx/posted_intr.c @@ -297,6 +297,7 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, { struct kvm_kernel_irq_routing_entry *e; struct kvm_irq_routing_table *irq_rt; + bool enable_remapped_mode = true; struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; @@ -335,21 +336,8 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, kvm_set_msi_irq(kvm, e, &irq); if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || - !kvm_irq_is_postable(&irq)) { - /* - * Make sure the IRTE is in remapped mode if - * we don't handle it in posted mode. - */ - ret = irq_set_vcpu_affinity(host_irq, NULL); - if (ret < 0) { - printk(KERN_INFO - "failed to back to remapped mode, irq: %u\n", - host_irq); - goto out; - } - + !kvm_irq_is_postable(&irq)) continue; - } vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); vcpu_info.vector = irq.vector; @@ -357,11 +345,12 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, vcpu_info.vector, vcpu_info.pi_desc_addr, set); - if (set) - ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else - ret = irq_set_vcpu_affinity(host_irq, NULL); + if (!set) + continue; + enable_remapped_mode = false; + + ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", __func__); @@ -369,6 +358,9 @@ int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, } } + if (enable_remapped_mode) + ret = irq_set_vcpu_affinity(host_irq, NULL); + ret = 0; out: srcu_read_unlock(&kvm->irq_srcu, idx); -- cgit v1.2.3 From bcda70c56f3e718465cab2aad260cf34183ce1ce Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:18 -0700 Subject: KVM: x86: Explicitly treat routing entry type changes as changes Explicitly treat type differences as GSI routing changes, as comparing MSI data between two entries could get a false negative, e.g. if userspace changed the type but left the type-specific data as-is. 
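A toy, self-contained version of the false negative (structure and field names are illustrative, not the KVM ones): if only the MSI payload is compared, a change of the entry type goes unnoticed whenever the payload bytes happen to be identical.

#include <stdio.h>
#include <string.h>

struct toy_route {
	int type;				/* e.g. MSI vs. IRQCHIP */
	struct { unsigned int addr, data; } msi;
};

/* The pre-fix style of check: payload only, type ignored. */
static int payload_only_changed(const struct toy_route *a,
				const struct toy_route *b)
{
	return memcmp(&a->msi, &b->msi, sizeof(b->msi)) != 0;
}

int main(void)
{
	struct toy_route old = { .type = 1, .msi = { 0xfee00000u, 0x4021u } };
	struct toy_route upd = old;

	upd.type = 2;				/* userspace changed only the type */
	printf("change reported: %d (the routing update is missed)\n",
	       payload_only_changed(&old, &upd));
	return 0;
}
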
Fixes: 515a0c79e796 ("kvm: irqfd: avoid update unmodified entries of the routing") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-4-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1fdd527044c..9c98b77b7dc1 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13607,7 +13607,8 @@ int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq, bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old, struct kvm_kernel_irq_routing_entry *new) { - if (new->type != KVM_IRQ_ROUTING_MSI) + if (old->type != KVM_IRQ_ROUTING_MSI || + new->type != KVM_IRQ_ROUTING_MSI) return true; return !!memcmp(&old->msi, &new->msi, sizeof(new->msi)); -- cgit v1.2.3 From f1fb088d9cecde5c3066d8ff8846789667519b7d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:19 -0700 Subject: KVM: x86: Take irqfds.lock when adding/deleting IRQ bypass producer Take irqfds.lock when adding/deleting an IRQ bypass producer to ensure irqfd->producer isn't modified while kvm_irq_routing_update() is running. The only lock held when a producer is added/removed is irqbypass's mutex. Fixes: 872768800652 ("KVM: x86: select IRQ_BYPASS_MANAGER") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-5-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 9c98b77b7dc1..a6829a370e6a 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -13561,15 +13561,22 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, { struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); + + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = prod; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + spin_unlock_irq(&kvm->irqfds.lock); + + return ret; } @@ -13579,9 +13586,9 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, int ret; struct kvm_kernel_irqfd *irqfd = container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; WARN_ON(irqfd->producer != prod); - irqfd->producer = NULL; /* * When producer of consumer is unregistered, we change back to @@ -13589,12 +13596,18 @@ void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, * when the irq is masked/disabled or the consumer side (KVM * int this case doesn't want to receive the interrupts. 
*/ + spin_lock_irq(&kvm->irqfds.lock); + irqfd->producer = NULL; + ret = kvm_x86_call(pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0); if (ret) printk(KERN_INFO "irq bypass consumer (token %p) unregistration" " fails: %d\n", irqfd->consumer.token, ret); + spin_unlock_irq(&kvm->irqfds.lock); + + kvm_arch_end_assignment(irqfd->kvm); } -- cgit v1.2.3 From 07172206a26dcf3f0bf7c3ecaadd4242b008ea54 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:20 -0700 Subject: iommu/amd: Return an error if vCPU affinity is set for non-vCPU IRTE Return -EINVAL instead of success if amd_ir_set_vcpu_affinity() is invoked without use_vapic; lying to KVM about whether or not the IRTE was configured to post IRQs is all kinds of bad. Fixes: d98de49a53e4 ("iommu/amd: Enable vAPIC interrupt remapping mode by default") Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-6-seanjc@google.com> Signed-off-by: Paolo Bonzini --- drivers/iommu/amd/iommu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index be8761bbef0f..00965518b802 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3879,7 +3879,7 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) * we should not modify the IRTE */ if (!dev_data || !dev_data->use_vapic) - return 0; + return -EINVAL; ir_data->cfg = irqd_cfg(data); pi_data->ir_data = ir_data; -- cgit v1.2.3 From aae251a380fe4741594368e0d7836a082b17ae3e Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:21 -0700 Subject: iommu/amd: WARN if KVM attempts to set vCPU affinity without posted intrrupts WARN if KVM attempts to set vCPU affinity when posted interrupts aren't enabled, as KVM shouldn't try to enable posting when they're unsupported, and the IOMMU driver darn well should only advertise posting support when AMD_IOMMU_GUEST_IR_VAPIC() is true. Note, KVM consumes is_guest_mode only on success. Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-7-seanjc@google.com> Signed-off-by: Paolo Bonzini --- drivers/iommu/amd/iommu.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/drivers/iommu/amd/iommu.c b/drivers/iommu/amd/iommu.c index 00965518b802..f34209b08b4c 100644 --- a/drivers/iommu/amd/iommu.c +++ b/drivers/iommu/amd/iommu.c @@ -3869,6 +3869,9 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) struct irq_2_irte *irte_info = &ir_data->irq_2_irte; struct iommu_dev_data *dev_data; + if (WARN_ON_ONCE(!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir))) + return -EINVAL; + if (ir_data->iommu == NULL) return -EINVAL; @@ -3884,16 +3887,6 @@ static int amd_ir_set_vcpu_affinity(struct irq_data *data, void *vcpu_info) ir_data->cfg = irqd_cfg(data); pi_data->ir_data = ir_data; - /* Note: - * SVM tries to set up for VAPIC mode, but we are in - * legacy mode. So, we force legacy mode instead. 
- */ - if (!AMD_IOMMU_GUEST_IR_VAPIC(amd_iommu_guest_ir)) { - pr_debug("%s: Fall back to using intr legacy remap\n", - __func__); - pi_data->is_guest_mode = false; - } - pi_data->prev_ga_tag = ir_data->cached_ga_tag; if (pi_data->is_guest_mode) { ir_data->ga_root_ptr = (pi_data->base >> 12); -- cgit v1.2.3 From 268cbfe65bb9096f78f98d1e092b1939d3caa382 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 4 Apr 2025 12:38:22 -0700 Subject: KVM: SVM: WARN if an invalid posted interrupt IRTE entry is added Now that the AMD IOMMU doesn't signal success incorrectly, WARN if KVM attempts to track an AMD IRTE entry without metadata. Signed-off-by: Sean Christopherson Message-ID: <20250404193923.1413163-8-seanjc@google.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/avic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8e09f6ae98fd..7338879d1c0c 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -796,12 +796,15 @@ static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) struct amd_svm_iommu_ir *ir; u64 entry; + if (WARN_ON_ONCE(!pi->ir_data)) + return -EINVAL; + /** * In some cases, the existing irte is updated and re-set, * so we need to check here if it's already been * added * to the ir_list. */ - if (pi->ir_data && (pi->prev_ga_tag != 0)) { + if (pi->prev_ga_tag) { struct kvm *kvm = svm->vcpu.kvm; u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); -- cgit v1.2.3 From ca4f113b0b4c2de6ffb438d5d0ebb7337877c911 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 15 Apr 2025 13:48:20 +0300 Subject: KVM: x86: Do not use kvm_rip_read() unconditionally in KVM tracepoints Not all VMs allow access to RIP. Check guest_state_protected before calling kvm_rip_read(). This avoids, for example, hitting WARN_ON_ONCE in vt_cache_reg() for TDX VMs. Fixes: 81bf912b2c15 ("KVM: TDX: Implement TDX vcpu enter/exit path") Signed-off-by: Adrian Hunter Message-ID: <20250415104821.247234-2-adrian.hunter@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/trace.h | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index ccda95e53f62..ba736cbb0587 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -11,6 +11,13 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm +#ifdef CREATE_TRACE_POINTS +#define tracing_kvm_rip_read(vcpu) ({ \ + typeof(vcpu) __vcpu = vcpu; \ + __vcpu->arch.guest_state_protected ? 0 : kvm_rip_read(__vcpu); \ + }) +#endif + /* * Tracepoint for guest mode entry. 
*/ @@ -28,7 +35,7 @@ TRACE_EVENT(kvm_entry, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->rip = kvm_rip_read(vcpu); + __entry->rip = tracing_kvm_rip_read(vcpu); __entry->immediate_exit = force_immediate_exit; kvm_x86_call(get_entry_info)(vcpu, &__entry->intr_info, @@ -319,7 +326,7 @@ TRACE_EVENT(name, \ ), \ \ TP_fast_assign( \ - __entry->guest_rip = kvm_rip_read(vcpu); \ + __entry->guest_rip = tracing_kvm_rip_read(vcpu); \ __entry->isa = isa; \ __entry->vcpu_id = vcpu->vcpu_id; \ __entry->requests = READ_ONCE(vcpu->requests); \ @@ -423,7 +430,7 @@ TRACE_EVENT(kvm_page_fault, TP_fast_assign( __entry->vcpu_id = vcpu->vcpu_id; - __entry->guest_rip = kvm_rip_read(vcpu); + __entry->guest_rip = tracing_kvm_rip_read(vcpu); __entry->fault_address = fault_address; __entry->error_code = error_code; ), -- cgit v1.2.3 From 38e93267ca6807fc34288ce1a9c610bf219fc0e0 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 15 Apr 2025 13:48:21 +0300 Subject: KVM: x86: Do not use kvm_rip_read() unconditionally for KVM_PROFILING Not all VMs allow access to RIP. Check guest_state_protected before calling kvm_rip_read(). This avoids, for example, hitting WARN_ON_ONCE in vt_cache_reg() for TDX VMs. Fixes: 81bf912b2c15 ("KVM: TDX: Implement TDX vcpu enter/exit path") Signed-off-by: Adrian Hunter Message-ID: <20250415104821.247234-3-adrian.hunter@intel.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index a6829a370e6a..df5b99ea1f18 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -11098,7 +11098,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) /* * Profile KVM exit RIPs: */ - if (unlikely(prof_on == KVM_PROFILING)) { + if (unlikely(prof_on == KVM_PROFILING && + !vcpu->arch.guest_state_protected)) { unsigned long rip = kvm_rip_read(vcpu); profile_hit(KVM_PROFILING, (void *)rip); } -- cgit v1.2.3 From 032ce1ea9442e140a80e41078b5431d4c0fa2893 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Thu, 24 Apr 2025 12:19:18 +0200 Subject: x86/boot: Work around broken busybox 'truncate' tool The GNU coreutils version of truncate, which is the original, accepts a % prefix for the -s size argument which means the file in question should be padded to a multiple of the given size. This is currently used to pad the setup block of bzImage to a multiple of 4k before appending the decompressor. busybox reimplements truncate but does not support this idiom, and therefore fails the build since commit 9c54baab4401 ("x86/boot: Drop CRC-32 checksum and the build tool that generates it") Since very little build code within the kernel depends on the 'truncate' utility, work around this incompatibility by avoiding truncate altogether, and relying on dd to perform the padding. Fixes: 9c54baab4401 ("x86/boot: Drop CRC-32 checksum and the build tool that generates it") Reported-by: Tested-by: Philipp Stanner Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Link: https://lore.kernel.org/r/20250424101917.1552527-2-ardb+git@google.com --- arch/x86/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 81f55da81967..640fcac3af74 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -59,7 +59,7 @@ KBUILD_CFLAGS += $(CONFIG_CC_IMPLICIT_FALLTHROUGH) $(obj)/bzImage: asflags-y := $(SVGA_MODE) quiet_cmd_image = BUILD $@ - cmd_image = cp $< $@; truncate -s %4K $@; cat $(obj)/vmlinux.bin >>$@ + cmd_image = (dd if=$< bs=4k conv=sync status=none; cat $(filter-out $<,$(real-prereqs))) >$@ $(obj)/bzImage: $(obj)/setup.bin $(obj)/vmlinux.bin FORCE $(call if_changed,image) -- cgit v1.2.3 From edd43f4d6f50ec3de55a0c9e9df6348d1da51965 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 24 Apr 2025 10:28:14 -0600 Subject: io_uring: fix 'sync' handling of io_fallback_tw() A previous commit added a 'sync' parameter to io_fallback_tw(), which if true, means the caller wants to wait on the fallback thread handling it. But the logic is somewhat messed up, ensure that ctxs are swapped and flushed appropriately. Cc: stable@vger.kernel.org Fixes: dfbe5561ae93 ("io_uring: flush offloaded and delayed task_work on exit") Signed-off-by: Jens Axboe --- io_uring/io_uring.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index eedb47c8c79e..a2b256e96d5d 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1083,21 +1083,22 @@ static __cold void __io_fallback_tw(struct llist_node *node, bool sync) while (node) { req = container_of(node, struct io_kiocb, io_task_work.node); node = node->next; - if (sync && last_ctx != req->ctx) { + if (last_ctx != req->ctx) { if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } last_ctx = req->ctx; percpu_ref_get(&last_ctx->refs); } - if (llist_add(&req->io_task_work.node, - &req->ctx->fallback_llist)) - schedule_delayed_work(&req->ctx->fallback_work, 1); + if (llist_add(&req->io_task_work.node, &last_ctx->fallback_llist)) + schedule_delayed_work(&last_ctx->fallback_work, 1); } if (last_ctx) { - flush_delayed_work(&last_ctx->fallback_work); + if (sync) + flush_delayed_work(&last_ctx->fallback_work); percpu_ref_put(&last_ctx->refs); } } -- cgit v1.2.3 From 1a97fea9db9e9b9c4839d4232dde9f505ff5b4cc Mon Sep 17 00:00:00 2001 From: Luo Gengkun Date: Wed, 23 Apr 2025 06:47:24 +0000 Subject: perf/x86: Fix non-sampling (counting) events on certain x86 platforms Perf doesn't work at perf stat for hardware events on certain x86 platforms: $perf stat -- sleep 1 Performance counter stats for 'sleep 1': 16.44 msec task-clock # 0.016 CPUs utilized 2 context-switches # 121.691 /sec 0 cpu-migrations # 0.000 /sec 54 page-faults # 3.286 K/sec cycles instructions branches branch-misses The reason is that the check in x86_pmu_hw_config() for sampling events is unexpectedly applied to counting events as well. It should only impact x86 platforms with limit_period used for non-PEBS events. For Intel platforms, it should only impact some older platforms, e.g., HSW, BDW and NHM. 
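In effect, the one-line fix below gates the limit_period clamp on is_sampling_event(), so counting events (as used by perf stat) skip it entirely. A condensed sketch of the resulting logic in x86_pmu_hw_config(), taken from the hunk below (the -EINVAL rejection is the pre-existing handling for periods the PMU cannot honour):

	if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) {
		s64 left = event->attr.sample_period;

		x86_pmu.limit_period(event, &left);
		if (left > event->attr.sample_period)
			return -EINVAL;	/* PMU cannot honour the requested period */
	}

Counting events never set a sample period, so the clamp no longer rejects them.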
Fixes: 88ec7eedbbd2 ("perf/x86: Fix low freqency setting issue") Signed-off-by: Luo Gengkun Signed-off-by: Ingo Molnar Reviewed-by: Kan Liang Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Link: https://lore.kernel.org/r/20250423064724.3716211-1-luogengkun@huaweicloud.com --- arch/x86/events/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 6866cc5acb0b..3a4f031d2f44 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -629,7 +629,7 @@ int x86_pmu_hw_config(struct perf_event *event) if (event->attr.type == event->pmu->type) event->hw.config |= x86_pmu_get_event_config(event); - if (!event->attr.freq && x86_pmu.limit_period) { + if (is_sampling_event(event) && !event->attr.freq && x86_pmu.limit_period) { s64 left = event->attr.sample_period; x86_pmu.limit_period(event, &left); if (left > event->attr.sample_period) -- cgit v1.2.3 From 85fd85bc025a525354acb2241beb3c5387c551ec Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 23 Apr 2025 09:58:15 +0300 Subject: x86/insn: Fix CTEST instruction decoding insn_decoder_test found a problem with decoding APX CTEST instructions: Found an x86 instruction decoder bug, please report this. ffffffff810021df 62 54 94 05 85 ff ctestneq objdump says 6 bytes, but insn_get_length() says 5 It happens because x86-opcode-map.txt doesn't specify arguments for the instruction and the decoder doesn't expect to see ModRM byte. Fixes: 690ca3a3067f ("x86/insn: Add support for APX EVEX instructions to the opcode map") Signed-off-by: Kirill A. Shutemov Signed-off-by: Ingo Molnar Cc: H. Peter Anvin Cc: Adrian Hunter Cc: stable@vger.kernel.org # v6.10+ Link: https://lore.kernel.org/r/20250423065815.2003231-1-kirill.shutemov@linux.intel.com --- arch/x86/lib/x86-opcode-map.txt | 4 ++-- tools/arch/x86/lib/x86-opcode-map.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index caedb3ef6688..f5dd84eb55dc 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -996,8 +996,8 @@ AVXcode: 4 83: Grp1 Ev,Ib (1A),(es) # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, # CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ -84: CTESTSCC (ev) -85: CTESTSCC (es) | CTESTSCC (66),(es) +84: CTESTSCC Eb,Gb (ev) +85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es) 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) 8f: POP2 Bq,Rq (000),(11B),(ev) a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) diff --git a/tools/arch/x86/lib/x86-opcode-map.txt b/tools/arch/x86/lib/x86-opcode-map.txt index caedb3ef6688..f5dd84eb55dc 100644 --- a/tools/arch/x86/lib/x86-opcode-map.txt +++ b/tools/arch/x86/lib/x86-opcode-map.txt @@ -996,8 +996,8 @@ AVXcode: 4 83: Grp1 Ev,Ib (1A),(es) # CTESTSCC instructions are: CTESTB, CTESTBE, CTESTF, CTESTL, CTESTLE, CTESTNB, CTESTNBE, CTESTNL, # CTESTNLE, CTESTNO, CTESTNS, CTESTNZ, CTESTO, CTESTS, CTESTT, CTESTZ -84: CTESTSCC (ev) -85: CTESTSCC (es) | CTESTSCC (66),(es) +84: CTESTSCC Eb,Gb (ev) +85: CTESTSCC Ev,Gv (es) | CTESTSCC Ev,Gv (66),(es) 88: POPCNT Gv,Ev (es) | POPCNT Gv,Ev (66),(es) 8f: POP2 Bq,Rq (000),(11B),(ev) a5: SHLD Ev,Gv,CL (es) | SHLD Ev,Gv,CL (66),(es) -- cgit v1.2.3 From 121f34341d396b666d8a90b24768b40e08ca0d61 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Sat, 19 Apr 2025 13:13:59 +0200 
Subject: riscv: Replace function-like macro by static inline function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The flush_icache_range() function is implemented as a "function-like macro with unused parameters", which can result in "unused variables" warnings. Replace the macro with a static inline function, as advised by Documentation/process/coding-style.rst. Fixes: 08f051eda33b ("RISC-V: Flush I$ when making a dirty page executable") Signed-off-by: Björn Töpel Link: https://lore.kernel.org/r/20250419111402.1660267-1-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/include/asm/cacheflush.h | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index 8de73f91bfa3..b59ffeb668d6 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -34,11 +34,6 @@ static inline void flush_dcache_page(struct page *page) flush_dcache_folio(page_folio(page)); } -/* - * RISC-V doesn't have an instruction to flush parts of the instruction cache, - * so instead we just flush the whole thing. - */ -#define flush_icache_range(start, end) flush_icache_all() #define flush_icache_user_page(vma, pg, addr, len) \ do { \ if (vma->vm_flags & VM_EXEC) \ @@ -78,6 +73,16 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #endif /* CONFIG_SMP */ +/* + * RISC-V doesn't have an instruction to flush parts of the instruction cache, + * so instead we just flush the whole thing. + */ +#define flush_icache_range flush_icache_range +static inline void flush_icache_range(unsigned long start, unsigned long end) +{ + flush_icache_all(); +} + extern unsigned int riscv_cbom_block_size; extern unsigned int riscv_cboz_block_size; void riscv_init_cbo_blocksizes(void); -- cgit v1.2.3 From 7d1d19a11cfbfd8bae1d89cc010b2cc397cd0c48 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Sat, 19 Apr 2025 13:14:00 +0200 Subject: riscv: uprobes: Add missing fence.i after building the XOL buffer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XOL (execute out-of-line) buffer is used to single-step the replaced instruction(s) for uprobes. The RISC-V port was missing a proper fence.i (i$ flushing) after constructing the XOL buffer, which can result in incorrect execution of stale/broken instructions. This was found running the BPF selftests "test_progs: uprobe_autoattach, attach_probe" on the Spacemit K1/X60, where the uprobes tests randomly blew up. 
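The fix below follows the usual pattern for building an execute-out-of-line slot: map the page, copy the instruction bytes, flush the instruction cache over exactly the range written, then unmap. A condensed sketch of arch_uprobe_copy_ixol() after the patch (handling of the trailing breakpoint instruction is omitted here):

	void *kaddr = kmap_atomic(page);
	void *dst = kaddr + (vaddr & ~PAGE_MASK);
	unsigned long start = (unsigned long)dst;

	memcpy(dst, src, len);				/* build the XOL slot */
	flush_icache_range(start, start + len);		/* issues fence.i (whole-I$ flush on RISC-V) */
	kunmap_atomic(kaddr);

This replaces the earlier flush_dcache_page() call, which did not provide the fence.i described above.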
Reviewed-by: Guo Ren Fixes: 74784081aac8 ("riscv: Add uprobes supported") Signed-off-by: Björn Töpel Link: https://lore.kernel.org/r/20250419111402.1660267-2-bjorn@kernel.org Signed-off-by: Palmer Dabbelt --- arch/riscv/kernel/probes/uprobes.c | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/riscv/kernel/probes/uprobes.c b/arch/riscv/kernel/probes/uprobes.c index 4b3dc8beaf77..cc15f7ca6cc1 100644 --- a/arch/riscv/kernel/probes/uprobes.c +++ b/arch/riscv/kernel/probes/uprobes.c @@ -167,6 +167,7 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, /* Initialize the slot */ void *kaddr = kmap_atomic(page); void *dst = kaddr + (vaddr & ~PAGE_MASK); + unsigned long start = (unsigned long)dst; memcpy(dst, src, len); @@ -176,13 +177,6 @@ void arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, *(uprobe_opcode_t *)dst = __BUG_INSN_32; } + flush_icache_range(start, start + len); kunmap_atomic(kaddr); - - /* - * We probably need flush_icache_user_page() but it needs vma. - * This should work on most of architectures by default. If - * architecture needs to do something different it can define - * its own version of the function. - */ - flush_dcache_page(page); } -- cgit v1.2.3 From b9e1f873d2e152b1c82dfc50943f4d3ca322f800 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sun, 20 Apr 2025 12:15:15 -0400 Subject: bcachefs: Casefold is now a regular opts.h option Signed-off-by: Kent Overstreet --- fs/bcachefs/bcachefs_format.h | 1 + fs/bcachefs/fs.c | 33 ++++++++++++++++++++++----------- fs/bcachefs/inode.h | 8 ++++++++ fs/bcachefs/inode_format.h | 9 +++++---- fs/bcachefs/namei.c | 4 ---- fs/bcachefs/opts.h | 5 +++++ fs/bcachefs/str_hash.h | 5 ++--- 7 files changed, 43 insertions(+), 22 deletions(-) diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index 377f04d92a14..d6e4a496f02b 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -867,6 +867,7 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index e220e3cba29c..7c1943c83bf6 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -53,16 +53,19 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_subvolume *); /* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) +static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) { static const __maybe_unused unsigned bch_flags_to_vfs[] = { [__BCH_INODE_sync] = S_SYNC, [__BCH_INODE_immutable] = S_IMMUTABLE, [__BCH_INODE_append] = S_APPEND, [__BCH_INODE_noatime] = S_NOATIME, - [__BCH_INODE_casefolded] = S_CASEFOLD, }; + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); + + if (bch2_inode_casefold(c, &inode->ei_inode)) + inode->v.i_flags |= S_CASEFOLD; } void bch2_inode_update_after_write(struct btree_trans *trans, @@ -93,7 +96,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, inode->ei_inode = *bi; - bch2_inode_flags_to_vfs(inode); + bch2_inode_flags_to_vfs(c, inode); } int __must_check bch2_write_inode(struct bch_fs *c, @@ -1470,7 
+1473,6 @@ static const __maybe_unused unsigned bch_flags_to_uflags[] = { [__BCH_INODE_append] = FS_APPEND_FL, [__BCH_INODE_nodump] = FS_NODUMP_FL, [__BCH_INODE_noatime] = FS_NOATIME_FL, - [__BCH_INODE_casefolded] = FS_CASEFOLD_FL, }; /* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ @@ -1486,13 +1488,14 @@ static int bch2_fileattr_get(struct dentry *dentry, struct fileattr *fa) { struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; - if (inode->ei_inode.bi_flags & BCH_INODE_casefolded) + if (bch2_inode_casefold(c, &inode->ei_inode)) fa->flags |= FS_CASEFOLD_FL; fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; @@ -1504,6 +1507,8 @@ struct flags_set { unsigned flags; unsigned projid; bool set_project; + bool set_casefold; + bool casefold; }; static int fssetxattr_inode_update_fn(struct btree_trans *trans, @@ -1518,15 +1523,12 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, * We're relying on btree locking here for exclusion with other ioctl * calls - use the flags in the btree (@bi), not inode->i_flags: */ - unsigned newflags = s->flags; - unsigned oldflags = bi->bi_flags & s->mask; - if (!S_ISREG(bi->bi_mode) && !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) + (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) return -EINVAL; - if ((newflags ^ oldflags) & BCH_INODE_casefolded) { + if (s->casefold != bch2_inode_casefold(c, bi)) { #ifdef CONFIG_UNICODE int ret = 0; /* Not supported on individual files. */ @@ -1546,6 +1548,10 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, return ret; bch2_check_set_feature(c, BCH_FEATURE_casefolding); + + bi->bi_casefold = s->casefold + 1; + bi->bi_fields_set |= BIT(Inode_opt_casefold); + #else printk(KERN_ERR "Cannot use casefolding on a kernel without CONFIG_UNICODE\n"); return -EOPNOTSUPP; @@ -1558,7 +1564,7 @@ static int fssetxattr_inode_update_fn(struct btree_trans *trans, } bi->bi_flags &= ~s->mask; - bi->bi_flags |= newflags; + bi->bi_flags |= s->flags; bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); return 0; @@ -1598,6 +1604,11 @@ static int bch2_fileattr_set(struct mnt_idmap *idmap, if (fa->flags_valid) { s.mask = map_defined(bch_flags_to_uflags); + + s.set_casefold = true; + s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; + fa->flags &= ~FS_CASEFOLD_FL; + s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); if (fa->flags) return -EOPNOTSUPP; diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index f82cfbf460d0..c74af15b14f2 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -243,6 +243,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) } } +static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) +{ + /* inode apts are stored with a +1 bias: 0 means "unset, use fs opt" */ + return bi->bi_casefold + ? 
bi->bi_casefold - 1 + : c->opts.casefold; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index 117110af1e3f..87e193e8ed25 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -103,7 +103,8 @@ struct bch_inode_generation { x(bi_parent_subvol, 32) \ x(bi_nocow, 8) \ x(bi_depth, 32) \ - x(bi_inodes_32bit, 8) + x(bi_inodes_32bit, 8) \ + x(bi_casefold, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -117,7 +118,8 @@ struct bch_inode_generation { x(background_target, 16) \ x(erasure_code, 16) \ x(nocow, 8) \ - x(inodes_32bit, 8) + x(inodes_32bit, 8) \ + x(casefold, 8) enum inode_opt_id { #define x(name, ...) \ @@ -137,8 +139,7 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) \ - x(casefolded, 10) + x(has_child_snapshot, 9) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index 0d65ea96f7a2..46f3c8b100a9 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -47,10 +47,6 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - /* Inherit casefold state from parent. */ - if (S_ISDIR(mode)) - new_inode->bi_flags |= dir_u->bi_flags & BCH_INODE_casefolded; - if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 4d06313076ff..dfb14810124c 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -228,6 +228,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(casefold, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT, \ + OPT_BOOL(), \ + BCH_SB_CASEFOLD, false, \ + NULL, "Dirent lookups are casefolded") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 09a354a26c3b..0c1a00539bd1 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -33,7 +33,7 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) struct bch_hash_info { u8 type; - struct unicode_map *cf_encoding; + struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of * the siphash_key (k0) is used as the key. @@ -44,11 +44,10 @@ struct bch_hash_info { static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { - /* XXX ick */ struct bch_hash_info info = { .type = INODE_STR_HASH(bi), #ifdef CONFIG_UNICODE - .cf_encoding = !!(bi->bi_flags & BCH_INODE_casefolded) ? c->cf_encoding : NULL, + .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, #endif .siphash_key = { .k0 = bi->bi_hash_seed } }; -- cgit v1.2.3 From 9cdde3c7aa3d5bfc7c7b5ec849e5a3288a66af35 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 12:58:06 -0400 Subject: bcachefs: Fix casefold lookups Add casefolding to bch2_lookup_trans: During the delay between when casefolding was written and when it was merged, the main filesystem lookup path grew self healing - which meant it was no longer using bch2_dirent_lookup_trans(), where casefolding on lookups happens. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/dirent.c | 16 ++-------------- fs/bcachefs/dirent.h | 15 +++++++++++++++ fs/bcachefs/fs.c | 11 ++++++++--- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 8488a7578115..92ee59d9e00e 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,8 +13,8 @@ #include -static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) +int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) { *out_cf = (struct qstr) QSTR_INIT(NULL, 0); @@ -35,18 +35,6 @@ static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info * #endif } -static inline int bch2_maybe_casefold(struct btree_trans *trans, - const struct bch_hash_info *info, - const struct qstr *str, struct qstr *out_cf) -{ - if (likely(!info->cf_encoding)) { - *out_cf = *str; - return 0; - } else { - return bch2_casefold(trans, info, str, out_cf); - } -} - static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index 0880772b80a9..9838a7ba7ed1 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,6 +23,21 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; +int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, + const struct qstr *, struct qstr *); + +static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + if (likely(!info->cf_encoding)) { + *out_cf = *str; + return 0; + } else { + return bch2_casefold(trans, info, str, out_cf); + } +} + struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 7c1943c83bf6..c73e97052816 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -648,13 +648,18 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dirent_iter = {}; subvol_inum inum = {}; struct printbuf buf = PRINTBUF; + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); + if (ret) + return ERR_PTR(ret); + + struct btree_iter dirent_iter = {}; struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, name, 0); - int ret = bkey_err(k); + dir_hash_info, dir, &lookup_name, 0); + ret = bkey_err(k); if (ret) return ERR_PTR(ret); -- cgit v1.2.3 From 7cb85324c4f6817d5147b9d298c71bf9b5e87c6b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 24 Apr 2025 18:07:06 -0400 Subject: bcachefs: unlink: casefold d_invalidate casefolding results in additional aliases on lookup for the non-casefolded names - these need invalidating on unlink. 
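A condensed sketch of what the change below adds to __bch2_unlink(), after the victim inode's nlink handling (vdir is the parent directory, dentry/inode the unlinked victim):

	if (IS_CASEFOLDED(vdir)) {
		d_invalidate(dentry);		/* drop the dentry used for this unlink */
		d_prune_aliases(&inode->v);	/* and any other cached casefold aliases */
	}

Without this, a lookup through a differently-cased alias could still resolve to the unlinked inode from the dcache.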
Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index c73e97052816..5b716ffde500 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -847,6 +847,11 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, */ set_nlink(&inode->v, 0); } + + if (IS_CASEFOLDED(vdir)) { + d_invalidate(dentry); + d_prune_aliases(&inode->v); + } err: bch2_trans_put(trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); -- cgit v1.2.3 From caab547686d77733387fe734942846916374ddf2 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Mon, 21 Apr 2025 21:16:24 -0400 Subject: bcachefs: Print mount opts earlier If we aren't mounting with the correct degraded option, it's helpful to know that before we fail to mount degraded. Signed-off-by: Kent Overstreet --- fs/bcachefs/super.c | 76 ++++++++++++++++++++++++++--------------------------- 1 file changed, 37 insertions(+), 39 deletions(-) diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 4060af4692f9..e4ab0595c0ae 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -1004,6 +1004,40 @@ static void print_mount_opts(struct bch_fs *c) printbuf_exit(&p); } +static bool bch2_fs_may_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned i, flags = 0; + + if (c->opts.very_degraded) + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + + if (c->opts.degraded) + flags |= BCH_FORCE_IF_DEGRADED; + + if (!c->opts.degraded && + !c->opts.very_degraded) { + mutex_lock(&c->sb_lock); + + for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + + ca = bch2_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { + mutex_unlock(&c->sb_lock); + return false; + } + } + mutex_unlock(&c->sb_lock); + } + + return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); +} + int bch2_fs_start(struct bch_fs *c) { time64_t now = ktime_get_real_seconds(); @@ -1011,6 +1045,9 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); + if (!bch2_fs_may_start(c)) + return -BCH_ERR_insufficient_devices_to_start; + down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1537,40 +1574,6 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, } } -static bool bch2_fs_may_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i, flags = 0; - - if (c->opts.very_degraded) - flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) - flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - !c->opts.very_degraded) { - mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - - ca = bch2_dev_locked(c, i); - - if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); - return false; - } - } - mutex_unlock(&c->sb_lock); - } - - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); -} - static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { bch2_dev_io_ref_stop(ca, WRITE); @@ -2206,11 +2209,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } up_write(&c->state_lock); - if (!bch2_fs_may_start(c)) { - ret = -BCH_ERR_insufficient_devices_to_start; - goto err_print; - } - if (!c->opts.nostart) { ret = bch2_fs_start(c); if (ret) -- cgit v1.2.3 From 394ef278e1fdadc1f27acd8c0549a8831dfc1833 Mon 
Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 04:44:00 -0400 Subject: bcachefs: Unit test fixes The peek_end() tests expect an empty btree. Signed-off-by: Kent Overstreet --- fs/bcachefs/tests.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index c265b102267a..782a05fe7656 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) */ static int test_peek_end(struct bch_fs *c, u64 nr) { + delete_test_keys(c); + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; @@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) static int test_peek_end_extents(struct bch_fs *c, u64 nr) { + delete_test_keys(c); + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; -- cgit v1.2.3 From c4f89a1d3590243ef1bbd55557a1f55a4a93927d Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 04:45:25 -0400 Subject: bcachefs: Make btree_iter_peek_prev() assert more precise The issue this assert is guarding against is that in BTREE_ITER_filter_snapshots mode we only want to be iterating within a single inode number - if we iterate into another inode number with keys for a different snapshot tree, we'll loop arbitrarily long before finding a key we can return. This comes up in the unit tests, where we're using inode 0 for our test keys. Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e34e9598ef25..400ff650553b 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2602,7 +2602,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode); int ret = trans_maybe_inject_restart(trans, _RET_IP_); if (unlikely(ret)) { -- cgit v1.2.3 From 353739f1d1675692d9de01483c75c15b314710ac Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 04:54:54 -0400 Subject: bcachefs: Fix btree_iter_peek_prev() at end of inode At the end of the inode, on an extents iterator, peek_slot() has to advance to the next position to avoid returning a 0 size extent, which is not allowed. Changing iter->pos confuses peek_prev(), but we don't need to call peek_slot() in this case. 
Signed-off-by: Kent Overstreet --- fs/bcachefs/btree_iter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index 400ff650553b..59fa527ac685 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -2577,7 +2577,10 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct struct bpos end) { if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && - !bkey_eq(iter->pos, POS_MAX)) { + !bkey_eq(iter->pos, POS_MAX) && + !((iter->flags & BTREE_ITER_is_extents) && + iter->pos.offset == U64_MAX)) { + /* * bkey_start_pos(), for extents, is not monotonically * increasing until after filtering for snapshots: -- cgit v1.2.3 From 28d2d19ccc8e36dacdd65303b051972926943394 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Jan 2024 12:26:56 -0500 Subject: bcachefs: drop duplicate fiemap sync flag FIEMAP_FLAG_SYNC handling was deliberately moved into core code in commit 45dd052e67ad ("fs: handle FIEMAP_FLAG_SYNC in fiemap_prep"), released in kernel v5.8. Update bcachefs accordingly. Signed-off-by: Brian Foster --- fs/bcachefs/fs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 5b716ffde500..59d919aeda74 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1330,7 +1330,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bool have_extent = false; int ret = 0; - ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(&ei->v, info, start, &len, 0); if (ret) return ret; -- cgit v1.2.3 From d020a9fb11bd85f4f16392e2a44c46ae9778b3ee Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Jan 2024 14:21:15 -0500 Subject: bcachefs: track current fiemap offset in start variable Signed-off-by: Brian Foster --- fs/bcachefs/fs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 59d919aeda74..dabec16f2639 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1387,6 +1387,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bch2_bkey_buf_realloc(&prev, c, k.k->u64s); sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + start = iter.pos.offset + sectors; bch2_cut_front(POS(k.k->p.inode, bkey_start_offset(k.k) + @@ -1407,8 +1408,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, bkey_copy(prev.k, cur.k); have_extent = true; - bch2_btree_iter_set_pos(trans, &iter, - POS(iter.pos.inode, iter.pos.offset + sectors)); + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, start)); } bch2_trans_iter_exit(trans, &iter); -- cgit v1.2.3 From 2d55a637095d0eaaad609b8a518589ead34487b3 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Jan 2024 14:27:49 -0500 Subject: bcachefs: refactor fiemap processing into extent helper and struct The bulk of the loop in bch2_fiemap() involves processing the current extent key from the iter, including following indirections and trimming the extent size and such. This patch makes a few changes to reduce the size of the loop and facilitate future changes to support delalloc extents. Define a new bch_fiemap_extent structure to wrap the bkey buffer that holds the extent key to report to userspace along with associated fiemap flags. Update bch2_fill_extent() to take the bch_fiemap_extent as a param instead of the individual fields. 
Finally, lift the bulk of the extent processing into a bch2_fiemap_extent() helper that takes the current key and formats the bch_fiemap_extent appropriately for the fill function. No functional changes intended by this patch. Signed-off-by: Brian Foster --- fs/bcachefs/fs.c | 87 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 34 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index dabec16f2639..00698457dee5 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1262,10 +1262,18 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, return finish_open_simple(file, 0); } +struct bch_fiemap_extent { + struct bkey_buf kbuf; + unsigned flags; +}; + static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, - struct bkey_s_c k, unsigned flags) + struct bch_fiemap_extent *fe) { + struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); + unsigned flags = fe->flags; + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1318,6 +1326,36 @@ static int bch2_fill_extent(struct bch_fs *c, } } +static int bch2_fiemap_extent(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k, + struct bch_fiemap_extent *cur) +{ + s64 offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); + unsigned sectors = k.k->size - offset_into_extent; + + bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); + + enum btree_id data_btree = BTREE_ID_extents; + int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, + &cur->kbuf); + if (ret) + return ret; + + k = bkey_i_to_s_c(cur->kbuf.k); + sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + + bch2_cut_front(POS(k.k->p.inode, + bkey_start_offset(k.k) + offset_into_extent), + cur->kbuf.k); + bch2_key_resize(&cur->kbuf.k->k, sectors); + cur->kbuf.k->k.p = iter->pos; + cur->kbuf.k->k.p.offset += cur->kbuf.k->k.size; + + cur->flags = 0; + + return 0; +} + static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u64 start, u64 len) { @@ -1326,7 +1364,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, struct btree_trans *trans; struct btree_iter iter; struct bkey_s_c k; - struct bkey_buf cur, prev; + struct bch_fiemap_extent cur, prev; bool have_extent = false; int ret = 0; @@ -1340,15 +1378,14 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, start >>= 9; - bch2_bkey_buf_init(&cur); - bch2_bkey_buf_init(&prev); + bch2_bkey_buf_init(&cur.kbuf); + bch2_bkey_buf_init(&prev.kbuf); trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, POS(ei->v.i_ino, start), 0); while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - enum btree_id data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1373,39 +1410,21 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, continue; } - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&cur, c, k); - - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &cur); + ret = bch2_fiemap_extent(trans, &iter, k, &cur); if (ret) - continue; - - k = bkey_i_to_s_c(cur.k); - bch2_bkey_buf_realloc(&prev, c, k.k->u64s); - - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - start = iter.pos.offset + sectors; - - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + - 
offset_into_extent), - cur.k); - bch2_key_resize(&cur.k->k, sectors); - cur.k->k.p = iter.pos; - cur.k->k.p.offset += cur.k->k.size; + break; + bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s); + start = cur.kbuf.k->k.p.offset; if (have_extent) { bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(prev.k), 0); + ret = bch2_fill_extent(c, info, &prev); if (ret) break; } - bkey_copy(prev.k, cur.k); + bkey_copy(prev.kbuf.k, cur.kbuf.k); + prev.flags = cur.flags; have_extent = true; bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, start)); @@ -1414,13 +1433,13 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (!ret && have_extent) { bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), - FIEMAP_EXTENT_LAST); + prev.flags |= FIEMAP_EXTENT_LAST; + ret = bch2_fill_extent(c, info, &prev); } bch2_trans_put(trans); - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); + bch2_bkey_buf_exit(&cur.kbuf, c); + bch2_bkey_buf_exit(&prev.kbuf, c); return ret < 0 ? ret : 0; } -- cgit v1.2.3 From b9b0494017b5f6d0664ecbcd2d8870800f045581 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Mon, 15 Jan 2024 14:46:00 -0500 Subject: bcachefs: add fiemap delalloc extent detection bcachefs currently populates fiemap data from the extents btree. This works correctly when the fiemap sync flag is provided, but if not, it skips all delalloc extents that have not yet been flushed. This is because delalloc extents from buffered writes are first stored as reservation in the pagecache, and only become resident in the extents btree after writeback completes. Update the fiemap implementation to process holes between extents by scanning pagecache for data, via seek data/hole. If a valid data range is found over a hole in the extent btree, fake up an extent key and flag the extent as delalloc for reporting to userspace. Note that this does not necessarily change behavior for the case where there is dirty pagecache over already written extents, where when in COW mode, writeback will allocate new blocks for the underlying ranges. The existing behavior is consistent with btrfs and it is recommended to use the sync flag for the most up to date extent state from fiemap. Signed-off-by: Brian Foster --- fs/bcachefs/fs.c | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 112 insertions(+), 7 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 00698457dee5..f5a9f9e8c457 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1356,6 +1356,87 @@ static int bch2_fiemap_extent(struct btree_trans *trans, return 0; } +/* + * Scan a range of an inode for data in pagecache. + * + * Intended to be retryable, so don't modify the output params until success is + * imminent. + */ +static int +bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, + bool nonblock) +{ + loff_t dstart, dend; + + dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); + if (dstart < 0) + return dstart; + if (dstart >= *end) + return -ENOENT; + + dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); + if (dend < 0) + return dend; + + *start = dstart; + *end = dend; + return 0; +} + +/* + * Scan a range of pagecache that corresponds to a file mapping hole in the + * extent btree. If data is found, fake up an extent key so it looks like a + * delalloc extent to the rest of the fiemap processing code. + * + * Returns 0 if cached data was found, -ENOENT if not. 
+ */ +static int +bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start, + u64 end, struct bch_fiemap_extent *cur) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct bkey_i_extent *delextent; + struct bch_extent_ptr ptr = {}; + loff_t dstart = start, dend = end; + int ret; + + /* + * We hold btree locks here so we cannot block on folio locks without + * dropping trans locks first. Run a nonblocking scan for the common + * case of no folios over holes and fall back on failure. + * + * Note that dropping locks like this is technically racy against + * writeback inserting to the extent tree, but a non-sync fiemap scan is + * fundamentally racy with writeback anyways. Therefore, just report the + * range as delalloc regardless of whether we have to cycle trans locks. + */ + ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true); + if (ret == -EAGAIN) { + /* open coded drop_locks_do() to relock even on error */ + bch2_trans_unlock(trans); + ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false); + bch2_trans_relock(trans); + } + if (ret < 0) + return ret; + + /* + * Create a fake extent key in the buffer. We have to add a dummy extent + * pointer for the fill code to add an extent entry. It's explicitly + * zeroed to reflect delayed allocation (i.e. phys offset 0). + */ + bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); + delextent = bkey_extent_init(cur->kbuf.k); + delextent->k.p = POS(ei->v.i_ino, dstart >> 9); + bch2_key_resize(&delextent->k, (dend - dstart) >> 9); + bch2_bkey_append_ptr(&delextent->k_i, ptr); + + cur->flags = FIEMAP_EXTENT_DELALLOC; + + return 0; +} + static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u64 start, u64 len) { @@ -1386,6 +1467,7 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, POS(ei->v.i_ino, start), 0); while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bool have_delalloc = false; bch2_trans_begin(trans); @@ -1404,15 +1486,38 @@ static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, if (!k.k) break; - if (!bkey_extent_is_data(k.k) && - k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(trans, &iter); - continue; + /* + * If a hole exists before the start of the extent key, scan the + * range for pagecache data that might be pending writeback and + * thus not yet exist in the extent tree. + */ + if (iter.pos.offset > start) { + ret = bch2_fiemap_hole(trans, vinode, start << 9, + iter.pos.offset << 9, &cur); + if (!ret) + have_delalloc = true; + else if (ret != -ENOENT) + break; } - ret = bch2_fiemap_extent(trans, &iter, k, &cur); - if (ret) - break; + /* process the current key if there's no delalloc to report */ + if (!have_delalloc) { + if (!bkey_extent_is_data(k.k) && + k.k->type != KEY_TYPE_reservation) { + start = bkey_start_offset(k.k) + k.k->size; + bch2_btree_iter_advance(trans, &iter); + continue; + } + + ret = bch2_fiemap_extent(trans, &iter, k, &cur); + if (ret) + break; + } + + /* + * Store the current extent in prev so we can flag the last + * extent on the way out. 
+ */ bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s); start = cur.kbuf.k->k.p.offset; -- cgit v1.2.3 From d1b0f9aa73fe50ee5276708e33d77c4e7054e555 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Tue, 22 Apr 2025 11:52:45 -0400 Subject: bcachefs: Rework fiemap transaction restart handling Restart handling in the previous patch was incorrect, so: move btree operations into a separate helper, and run it with a lockrestart_do(). Additionally, clarify whether pagecache or the btree takes precedence. Right now, the btree takes precedence: this is incorrect, but it's needed to pass fstests. Add a giant comment explaining why. Signed-off-by: Kent Overstreet --- fs/bcachefs/fs.c | 225 +++++++++++++++++++++++++++---------------------------- 1 file changed, 112 insertions(+), 113 deletions(-) diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index f5a9f9e8c457..0f1d61aab90b 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -1274,6 +1274,8 @@ static int bch2_fill_extent(struct bch_fs *c, struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); unsigned flags = fe->flags; + BUG_ON(!k.k->size); + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1326,36 +1328,6 @@ static int bch2_fill_extent(struct bch_fs *c, } } -static int bch2_fiemap_extent(struct btree_trans *trans, - struct btree_iter *iter, struct bkey_s_c k, - struct bch_fiemap_extent *cur) -{ - s64 offset_into_extent = iter->pos.offset - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; - - bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); - - enum btree_id data_btree = BTREE_ID_extents; - int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, - &cur->kbuf); - if (ret) - return ret; - - k = bkey_i_to_s_c(cur->kbuf.k); - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); - - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + offset_into_extent), - cur->kbuf.k); - bch2_key_resize(&cur->kbuf.k->k, sectors); - cur->kbuf.k->k.p = iter->pos; - cur->kbuf.k->k.p.offset += cur->kbuf.k->k.size; - - cur->flags = 0; - - return 0; -} - /* * Scan a range of an inode for data in pagecache. * @@ -1371,13 +1343,19 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); if (dstart < 0) return dstart; - if (dstart >= *end) - return -ENOENT; + + if (dstart == *end) { + *start = dstart; + return 0; + } dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); if (dend < 0) return dend; + /* race */ + BUG_ON(dstart == dend); + *start = dstart; *end = dend; return 0; @@ -1387,18 +1365,15 @@ bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, * Scan a range of pagecache that corresponds to a file mapping hole in the * extent btree. If data is found, fake up an extent key so it looks like a * delalloc extent to the rest of the fiemap processing code. - * - * Returns 0 if cached data was found, -ENOENT if not. 
*/ static int -bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start, - u64 end, struct bch_fiemap_extent *cur) +bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, + u64 start, u64 end, struct bch_fiemap_extent *cur) { - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(vinode); + struct bch_fs *c = trans->c; struct bkey_i_extent *delextent; struct bch_extent_ptr ptr = {}; - loff_t dstart = start, dend = end; + loff_t dstart = start << 9, dend = end << 9; int ret; /* @@ -1411,13 +1386,10 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start, * fundamentally racy with writeback anyways. Therefore, just report the * range as delalloc regardless of whether we have to cycle trans locks. */ - ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, true); - if (ret == -EAGAIN) { - /* open coded drop_locks_do() to relock even on error */ - bch2_trans_unlock(trans); - ret = bch2_fiemap_hole_pagecache(vinode, &dstart, &dend, false); - bch2_trans_relock(trans); - } + ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); + if (ret == -EAGAIN) + ret = drop_locks_do(trans, + bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); if (ret < 0) return ret; @@ -1428,8 +1400,8 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start, */ bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); delextent = bkey_extent_init(cur->kbuf.k); - delextent->k.p = POS(ei->v.i_ino, dstart >> 9); - bch2_key_resize(&delextent->k, (dend - dstart) >> 9); + delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); + delextent->k.size = (dend - dstart) >> 9; bch2_bkey_append_ptr(&delextent->k_i, ptr); cur->flags = FIEMAP_EXTENT_DELALLOC; @@ -1437,115 +1409,142 @@ bch2_fiemap_hole(struct btree_trans *trans, struct inode *vinode, u64 start, return 0; } +static int bch2_next_fiemap_extent(struct btree_trans *trans, + struct bch_inode_info *inode, + u64 start, u64 end, + struct bch_fiemap_extent *cur) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); + if (ret) + return ret; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, + SPOS(inode->ei_inum.inum, start, snapshot), 0); + + struct bkey_s_c k = + bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); + ret = bkey_err(k); + if (ret) + goto err; + + ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, end, cur); + if (ret) + goto err; + + struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); + + /* + * Does the pagecache or the btree take precedence? + * + * It _should_ be the pagecache, so that we correctly report delalloc + * extents when dirty in the pagecache (we're COW, after all). + * + * But we'd have to add per-sector writeback tracking to + * bch_folio_state, otherwise we report delalloc extents for clean + * cached data in the pagecache. + * + * We should do this, but even then fiemap won't report stable mappings: + * on bcachefs data moves around in the background (copygc, rebalance) + * and we don't provide a way for userspace to lock that out. 
+ */ + if (k.k && + bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), + pagecache_start)) { + bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); + bch2_cut_front(iter.pos, cur->kbuf.k); + bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); + cur->flags = 0; + } else if (k.k) { + bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); + } + + if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { + unsigned sectors = cur->kbuf.k->k.size; + s64 offset_into_extent = 0; + enum btree_id data_btree = BTREE_ID_extents; + int ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, + &cur->kbuf); + if (ret) + goto err; + + struct bkey_i *k = cur->kbuf.k; + sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); + + bch2_cut_front(POS(k->k.p.inode, + bkey_start_offset(&k->k) + offset_into_extent), + k); + bch2_key_resize(&k->k, sectors); + k->k.p = iter.pos; + k->k.p.offset += k->k.size; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, u64 start, u64 len) { struct bch_fs *c = vinode->i_sb->s_fs_info; struct bch_inode_info *ei = to_bch_ei(vinode); struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; struct bch_fiemap_extent cur, prev; - bool have_extent = false; int ret = 0; ret = fiemap_prep(&ei->v, info, start, &len, 0); if (ret) return ret; - struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); if (start + len < start) return -EINVAL; start >>= 9; + u64 end = (start + len) >> 9; bch2_bkey_buf_init(&cur.kbuf); bch2_bkey_buf_init(&prev.kbuf); - trans = bch2_trans_get(c); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(ei->v.i_ino, start), 0); + bkey_init(&prev.kbuf.k->k); - while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - bool have_delalloc = false; - - bch2_trans_begin(trans); + trans = bch2_trans_get(c); - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); + while (start < end) { + ret = lockrestart_do(trans, + bch2_next_fiemap_extent(trans, ei, start, end, &cur)); if (ret) - continue; - - bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + goto err; - k = bch2_btree_iter_peek_max(trans, &iter, end); - ret = bkey_err(k); - if (ret) - continue; + BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); + BUG_ON(cur.kbuf.k->k.p.offset > end); - if (!k.k) + if (bkey_start_offset(&cur.kbuf.k->k) == end) break; - /* - * If a hole exists before the start of the extent key, scan the - * range for pagecache data that might be pending writeback and - * thus not yet exist in the extent tree. - */ - if (iter.pos.offset > start) { - ret = bch2_fiemap_hole(trans, vinode, start << 9, - iter.pos.offset << 9, &cur); - if (!ret) - have_delalloc = true; - else if (ret != -ENOENT) - break; - } - - /* process the current key if there's no delalloc to report */ - if (!have_delalloc) { - if (!bkey_extent_is_data(k.k) && - k.k->type != KEY_TYPE_reservation) { - start = bkey_start_offset(k.k) + k.k->size; - bch2_btree_iter_advance(trans, &iter); - continue; - } - - ret = bch2_fiemap_extent(trans, &iter, k, &cur); - if (ret) - break; - } - - /* - * Store the current extent in prev so we can flag the last - * extent on the way out. 
- */ - bch2_bkey_buf_realloc(&prev.kbuf, c, cur.kbuf.k->k.u64s); start = cur.kbuf.k->k.p.offset; - if (have_extent) { + if (!bkey_deleted(&prev.kbuf.k->k)) { bch2_trans_unlock(trans); ret = bch2_fill_extent(c, info, &prev); if (ret) - break; + goto err; } - bkey_copy(prev.kbuf.k, cur.kbuf.k); + bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); prev.flags = cur.flags; - have_extent = true; - - bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, start)); } - bch2_trans_iter_exit(trans, &iter); - if (!ret && have_extent) { + if (!bkey_deleted(&prev.kbuf.k->k)) { bch2_trans_unlock(trans); prev.flags |= FIEMAP_EXTENT_LAST; ret = bch2_fill_extent(c, info, &prev); } - +err: bch2_trans_put(trans); bch2_bkey_buf_exit(&cur.kbuf, c); bch2_bkey_buf_exit(&prev.kbuf, c); - return ret < 0 ? ret : 0; + + return bch2_err_class(ret < 0 ? ret : 0); } static const struct vm_operations_struct bch_vm_ops = { -- cgit v1.2.3 From d6aa0c178bf81f30ae4a780b2bca653daa2eb633 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Apr 2025 09:37:39 +0800 Subject: ublk: call ublk_dispatch_req() for handling UBLK_U_IO_NEED_GET_DATA We call io_uring_cmd_complete_in_task() to schedule task_work for handling UBLK_U_IO_NEED_GET_DATA. This way is really not necessary because the current context is exactly the ublk queue context, so call ublk_dispatch_req() directly for handling UBLK_U_IO_NEED_GET_DATA. Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Tested-by: Jared Holzman Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250425013742.1079549-2-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index 2de7b2bd409d..c4d4be4f6fbd 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1886,15 +1886,6 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) } } -static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id, - int tag) -{ - struct ublk_queue *ubq = ublk_get_queue(ub, q_id); - struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag); - - ublk_queue_cmd(ubq, req); -} - static inline int ublk_check_cmd_op(u32 cmd_op) { u32 ioc_type = _IOC_TYPE(cmd_op); @@ -2103,8 +2094,9 @@ static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd, if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) goto out; ublk_fill_io_cmd(io, cmd, ub_cmd->addr); - ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag); - break; + req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag); + ublk_dispatch_req(ubq, req, issue_flags); + return -EIOCBQUEUED; default: goto out; } -- cgit v1.2.3 From f40139fde5278d81af3227444fd6e76a76b9506d Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Fri, 25 Apr 2025 09:37:40 +0800 Subject: ublk: fix race between io_uring_cmd_complete_in_task and ublk_cancel_cmd ublk_cancel_cmd() calls io_uring_cmd_done() to complete uring_cmd, but we may have scheduled task work via io_uring_cmd_complete_in_task() for dispatching request, then kernel crash can be triggered. Fix it by not trying to canceling the command if ublk block request is started. 
Fixes: 216c8f5ef0f2 ("ublk: replace monitor with cancelable uring_cmd") Reported-by: Jared Holzman Tested-by: Jared Holzman Closes: https://lore.kernel.org/linux-block/d2179120-171b-47ba-b664-23242981ef19@nvidia.com/ Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20250425013742.1079549-3-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index c4d4be4f6fbd..40f971a66d3e 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1683,14 +1683,31 @@ static void ublk_start_cancel(struct ublk_queue *ubq) ublk_put_disk(disk); } -static void ublk_cancel_cmd(struct ublk_queue *ubq, struct ublk_io *io, +static void ublk_cancel_cmd(struct ublk_queue *ubq, unsigned tag, unsigned int issue_flags) { + struct ublk_io *io = &ubq->ios[tag]; + struct ublk_device *ub = ubq->dev; + struct request *req; bool done; if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) return; + /* + * Don't try to cancel this command if the request is started for + * avoiding race between io_uring_cmd_done() and + * io_uring_cmd_complete_in_task(). + * + * Either the started request will be aborted via __ublk_abort_rq(), + * then this uring_cmd is canceled next time, or it will be done in + * task work function ublk_dispatch_req() because io_uring guarantees + * that ublk_dispatch_req() is always called + */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], tag); + if (req && blk_mq_request_started(req)) + return; + spin_lock(&ubq->cancel_lock); done = !!(io->flags & UBLK_IO_FLAG_CANCELED); if (!done) @@ -1722,7 +1739,6 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); struct ublk_queue *ubq = pdu->ubq; struct task_struct *task; - struct ublk_io *io; if (WARN_ON_ONCE(!ubq)) return; @@ -1737,9 +1753,8 @@ static void ublk_uring_cmd_cancel_fn(struct io_uring_cmd *cmd, if (!ubq->canceling) ublk_start_cancel(ubq); - io = &ubq->ios[pdu->tag]; - WARN_ON_ONCE(io->cmd != cmd); - ublk_cancel_cmd(ubq, io, issue_flags); + WARN_ON_ONCE(ubq->ios[pdu->tag].cmd != cmd); + ublk_cancel_cmd(ubq, pdu->tag, issue_flags); } static inline bool ublk_queue_ready(struct ublk_queue *ubq) @@ -1752,7 +1767,7 @@ static void ublk_cancel_queue(struct ublk_queue *ubq) int i; for (i = 0; i < ubq->q_depth; i++) - ublk_cancel_cmd(ubq, &ubq->ios[i], IO_URING_F_UNLOCKED); + ublk_cancel_cmd(ubq, i, IO_URING_F_UNLOCKED); } /* Cancel all pending commands, must be called after del_gendisk() returns */ -- cgit v1.2.3 From e079d7c4db5cba1e8a315dc93030dfb6c7b49459 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 23 Apr 2025 06:59:41 +0200 Subject: devtmpfs: don't use vfs_getattr_nosec to query i_mode The recent move of the bdev_statx call to the low-level vfs_getattr_nosec helper caused it being used by devtmpfs, which leads to deadlocks in md teardown due to the block device lookup and put interfering with the unusual lifetime rules in md. But as handle_remove only works on inodes created and owned by devtmpfs itself there is no need to use vfs_getattr_nosec vs simply reading the mode from the inode directly. Switch to that to avoid the bdev lookup or any other unintentional side effect. 
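Concretely, the patch below drops the vfs_getattr() call from handle_remove() and has dev_mynode() test the inode fields that devtmpfs itself populated; a condensed sketch of the resulting helper:

	static int dev_mynode(struct device *dev, struct inode *inode)
	{
		/* only consider inodes devtmpfs created itself */
		if (inode->i_private != &thread)
			return 0;

		/* type and dev_t must match the device being removed */
		if (is_blockdev(dev) ? !S_ISBLK(inode->i_mode) : !S_ISCHR(inode->i_mode))
			return 0;

		return inode->i_rdev == dev->devt;
	}

handle_remove() then resets permissions and unlinks the node using inode->i_mode directly, so no block-device lookup is triggered.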
Reported-by: Shin'ichiro Kawasaki Reported-by: Xiao Ni Fixes: 777d0961ff95 ("fs: move the bdex_statx call to vfs_getattr_nosec") Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/20250423045941.1667425-1-hch@lst.de Tested-by: Shin'ichiro Kawasaki Tested-by: Xiao Ni Tested-by: Ayush Jain Tested-by: Heiko Carstens Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- drivers/base/devtmpfs.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 6dd1a8860f1c..31bfb3194b4c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -296,7 +296,7 @@ static int delete_path(const char *nodepath) return err; } -static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *stat) +static int dev_mynode(struct device *dev, struct inode *inode) { /* did we create it */ if (inode->i_private != &thread) @@ -304,13 +304,13 @@ static int dev_mynode(struct device *dev, struct inode *inode, struct kstat *sta /* does the dev_t match */ if (is_blockdev(dev)) { - if (!S_ISBLK(stat->mode)) + if (!S_ISBLK(inode->i_mode)) return 0; } else { - if (!S_ISCHR(stat->mode)) + if (!S_ISCHR(inode->i_mode)) return 0; } - if (stat->rdev != dev->devt) + if (inode->i_rdev != dev->devt) return 0; /* ours */ @@ -321,20 +321,16 @@ static int handle_remove(const char *nodename, struct device *dev) { struct path parent; struct dentry *dentry; - struct kstat stat; - struct path p; + struct inode *inode; int deleted = 0; - int err; + int err = 0; dentry = kern_path_locked(nodename, &parent); if (IS_ERR(dentry)) return PTR_ERR(dentry); - p.mnt = parent.mnt; - p.dentry = dentry; - err = vfs_getattr(&p, &stat, STATX_TYPE | STATX_MODE, - AT_STATX_SYNC_AS_STAT); - if (!err && dev_mynode(dev, d_inode(dentry), &stat)) { + inode = d_inode(dentry); + if (dev_mynode(dev, inode)) { struct iattr newattrs; /* * before unlinking this node, reset permissions @@ -342,7 +338,7 @@ static int handle_remove(const char *nodename, struct device *dev) */ newattrs.ia_uid = GLOBAL_ROOT_UID; newattrs.ia_gid = GLOBAL_ROOT_GID; - newattrs.ia_mode = stat.mode & ~0777; + newattrs.ia_mode = inode->i_mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; inode_lock(d_inode(dentry)); -- cgit v1.2.3 From e6f141b332ddd9007756751b6afd24f799488fd8 Mon Sep 17 00:00:00 2001 From: "T.J. Mercier" Date: Wed, 23 Apr 2025 18:00:23 +0000 Subject: splice: remove duplicate noinline from pipe_clear_nowait pipe_clear_nowait has two noinline macros, but we only need one. I checked the whole tree, and this is the only occurrence: $ grep -r "noinline .* noinline" fs/splice.c:static noinline void noinline pipe_clear_nowait(struct file *file) $ Fixes: 0f99fc513ddd ("splice: clear FMODE_NOWAIT on file if splice/vmsplice is used") Signed-off-by: "T.J. Mercier" Link: https://lore.kernel.org/20250423180025.2627670-1-tjmercier@google.com Reviewed-by: Jens Axboe Signed-off-by: Christian Brauner --- fs/splice.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index 90d464241f15..4d6df083e0c0 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -45,7 +45,7 @@ * here if set to avoid blocking other users of this pipe if splice is * being done on it. 
*/ -static noinline void noinline pipe_clear_nowait(struct file *file) +static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); -- cgit v1.2.3 From 1d28f25d6a6c968ebee8ee6d5b65691d5bfcf95f Mon Sep 17 00:00:00 2001 From: Yangtao Li Date: Wed, 23 Apr 2025 06:34:22 -0600 Subject: MAINTAINERS: hfs/hfsplus: add myself as maintainer I used to maintain Allwinner SoC cpufreq and thermal drivers and have some work experience in the F2FS file system. I volunteered to maintain the code together with Slava and Adrian. Signed-off-by: Yangtao Li Link: https://lore.kernel.org/20250423123423.2062619-1-frank.li@vivo.com Acked-by: John Paul Adrian Glaubitz Signed-off-by: Christian Brauner --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index b8d1e41c27f6..c3116274cec3 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10459,6 +10459,7 @@ F: drivers/infiniband/hw/hfi1 HFS FILESYSTEM M: Viacheslav Dubeyko M: John Paul Adrian Glaubitz +M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained F: Documentation/filesystems/hfs.rst @@ -10467,6 +10468,7 @@ F: fs/hfs/ HFSPLUS FILESYSTEM M: Viacheslav Dubeyko M: John Paul Adrian Glaubitz +M: Yangtao Li L: linux-fsdevel@vger.kernel.org S: Maintained F: Documentation/filesystems/hfsplus.rst -- cgit v1.2.3 From f520bed25d17bb31c2d2d72b0a785b593a4e3179 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 24 Apr 2025 15:22:47 +0200 Subject: fs/xattr: Fix handling of AT_FDCWD in setxattrat(2) and getxattrat(2) Currently, setxattrat(2) and getxattrat(2) are wrongly handling the calls of the from setxattrat(AF_FDCWD, NULL, AT_EMPTY_PATH, ...) and fail with -EBADF error instead of operating on CWD. Fix it. Fixes: 6140be90ec70 ("fs/xattr: add *at family syscalls") Signed-off-by: Jan Kara Link: https://lore.kernel.org/20250424132246.16822-2-jack@suse.cz Signed-off-by: Christian Brauner --- fs/xattr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index 02bee149ad96..fabb2a04501e 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; @@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; -- cgit v1.2.3 From 3dfc0445274252301dfcf3980d79acceea6409d1 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 7 Apr 2025 16:33:06 +0300 Subject: MAINTAINERS: Assign maintainer for the port controller drivers Especially the port manager (tcpm.c) is so major driver that it should have somebody watching over it who really understands it, and the port controller interface in general. Assigning Badhri as the designated reviewer and restoring the status to Maintained from Orphan. 
Signed-off-by: Heikki Krogerus Cc: Badhri Jagan Sridharan Acked-by: Badhri Jagan Sridharan Link: https://lore.kernel.org/r/20250407133306.387576-1-heikki.krogerus@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- MAINTAINERS | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index 96b827049501..71408373ce22 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -25126,9 +25126,13 @@ S: Maintained F: drivers/usb/typec/mux/pi3usb30532.c USB TYPEC PORT CONTROLLER DRIVERS +M: Badhri Jagan Sridharan L: linux-usb@vger.kernel.org -S: Orphan -F: drivers/usb/typec/tcpm/ +S: Maintained +F: drivers/usb/typec/tcpm/tcpci.c +F: drivers/usb/typec/tcpm/tcpm.c +F: include/linux/usb/tcpci.h +F: include/linux/usb/tcpm.h USB TYPEC TUSB1046 MUX DRIVER M: Romain Gantois -- cgit v1.2.3 From 75673fda0c557ae26078177dd14d4857afbf128d Mon Sep 17 00:00:00 2001 From: Brandon Kammerdiener Date: Thu, 24 Apr 2025 11:32:51 -0400 Subject: bpf: fix possible endless loop in BPF map iteration The _safe variant used here gets the next element before running the callback, avoiding the endless loop condition. Signed-off-by: Brandon Kammerdiener Link: https://lore.kernel.org/r/20250424153246.141677-2-brandon.kammerdiener@intel.com Signed-off-by: Alexei Starovoitov Acked-by: Hou Tao --- kernel/bpf/hashtab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 5a5adc66b8e2..92b606d60020 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2189,7 +2189,7 @@ static long bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_ b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ -- cgit v1.2.3 From 3d9c463f959f41cd6616ebf8a5d15e9d3ef04f16 Mon Sep 17 00:00:00 2001 From: Brandon Kammerdiener Date: Thu, 24 Apr 2025 11:32:55 -0400 Subject: selftests/bpf: add test for softlock when modifying hashmap while iterating Add test that modifies the map while it's being iterated in such a way that hangs the kernel thread unless the _safe fix is applied to bpf_for_each_hash_elem. 
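The hang comes from the iterator chasing an element that the callback has just deleted and re-inserted; the _safe variant caches the next node first. A minimal sketch of the fixed loop (simplified from bpf_for_each_hash_elem(), u64 casts and percpu handling omitted):

    struct htab_elem *elem;
    struct hlist_nulls_node *n;

    /* grab the next node before the callback can re-link 'elem' */
    hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) {
            ret = callback_fn(map, elem->key, val, callback_ctx);
            if (ret)
                    break;
    }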
Signed-off-by: Brandon Kammerdiener Link: https://lore.kernel.org/r/20250424153246.141677-3-brandon.kammerdiener@intel.com Signed-off-by: Alexei Starovoitov Acked-by: Hou Tao --- tools/testing/selftests/bpf/prog_tests/for_each.c | 37 ++++++++++++++++++++++ .../selftests/bpf/progs/for_each_hash_modify.c | 30 ++++++++++++++++++ 2 files changed, 67 insertions(+) create mode 100644 tools/testing/selftests/bpf/progs/for_each_hash_modify.c diff --git a/tools/testing/selftests/bpf/prog_tests/for_each.c b/tools/testing/selftests/bpf/prog_tests/for_each.c index 09f6487f58b9..5fea3209566e 100644 --- a/tools/testing/selftests/bpf/prog_tests/for_each.c +++ b/tools/testing/selftests/bpf/prog_tests/for_each.c @@ -6,6 +6,7 @@ #include "for_each_array_map_elem.skel.h" #include "for_each_map_elem_write_key.skel.h" #include "for_each_multi_maps.skel.h" +#include "for_each_hash_modify.skel.h" static unsigned int duration; @@ -203,6 +204,40 @@ out: for_each_multi_maps__destroy(skel); } +static void test_hash_modify(void) +{ + struct for_each_hash_modify *skel; + int max_entries, i, err; + __u64 key, val; + + LIBBPF_OPTS(bpf_test_run_opts, topts, + .data_in = &pkt_v4, + .data_size_in = sizeof(pkt_v4), + .repeat = 1 + ); + + skel = for_each_hash_modify__open_and_load(); + if (!ASSERT_OK_PTR(skel, "for_each_hash_modify__open_and_load")) + return; + + max_entries = bpf_map__max_entries(skel->maps.hashmap); + for (i = 0; i < max_entries; i++) { + key = i; + val = i; + err = bpf_map__update_elem(skel->maps.hashmap, &key, sizeof(key), + &val, sizeof(val), BPF_ANY); + if (!ASSERT_OK(err, "map_update")) + goto out; + } + + err = bpf_prog_test_run_opts(bpf_program__fd(skel->progs.test_pkt_access), &topts); + ASSERT_OK(err, "bpf_prog_test_run_opts"); + ASSERT_OK(topts.retval, "retval"); + +out: + for_each_hash_modify__destroy(skel); +} + void test_for_each(void) { if (test__start_subtest("hash_map")) @@ -213,4 +248,6 @@ void test_for_each(void) test_write_map_key(); if (test__start_subtest("multi_maps")) test_multi_maps(); + if (test__start_subtest("hash_modify")) + test_hash_modify(); } diff --git a/tools/testing/selftests/bpf/progs/for_each_hash_modify.c b/tools/testing/selftests/bpf/progs/for_each_hash_modify.c new file mode 100644 index 000000000000..82307166f789 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/for_each_hash_modify.c @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2025 Intel Corporation */ +#include "vmlinux.h" +#include + +char _license[] SEC("license") = "GPL"; + +struct { + __uint(type, BPF_MAP_TYPE_HASH); + __uint(max_entries, 128); + __type(key, __u64); + __type(value, __u64); +} hashmap SEC(".maps"); + +static int cb(struct bpf_map *map, __u64 *key, __u64 *val, void *arg) +{ + bpf_map_delete_elem(map, key); + bpf_map_update_elem(map, key, val, 0); + return 0; +} + +SEC("tc") +int test_pkt_access(struct __sk_buff *skb) +{ + (void)skb; + + bpf_for_each_map_elem(&hashmap, cb, NULL, 0); + + return 0; +} -- cgit v1.2.3 From f88886de0927a2adf4c1b4c5c1f1d31d2023ef74 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 24 Apr 2025 18:45:42 -0700 Subject: bpf: Add namespace to BPF internal symbols Add namespace to BPF internal symbols used by light skeleton to prevent abuse and document with the code their allowed usage. 
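In practice the pairing looks like this (lines taken from the hunks below):

    /* exporting side, kernel/bpf/syscall.c */
    EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL");

    /* importing side, the bpf_preload light-skeleton module */
    MODULE_IMPORT_NS("BPF_INTERNAL");

Any other module that links against such a symbol without the import is flagged by modpost at build time and, unless CONFIG_MODULE_ALLOW_MISSING_NAMESPACE_IMPORTS is set, refused at load time.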
Fixes: b1d18a7574d0 ("bpf: Extend sys_bpf commands for bpf_syscall programs.") Signed-off-by: Alexei Starovoitov Signed-off-by: Andrii Nakryiko Acked-by: Kumar Kartikeya Dwivedi Link: https://lore.kernel.org/bpf/20250425014542.62385-1-alexei.starovoitov@gmail.com --- Documentation/bpf/bpf_devel_QA.rst | 8 ++++++++ kernel/bpf/preload/bpf_preload_kern.c | 1 + kernel/bpf/syscall.c | 6 +++--- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/Documentation/bpf/bpf_devel_QA.rst b/Documentation/bpf/bpf_devel_QA.rst index de27e1620821..0acb4c9b8d90 100644 --- a/Documentation/bpf/bpf_devel_QA.rst +++ b/Documentation/bpf/bpf_devel_QA.rst @@ -382,6 +382,14 @@ In case of new BPF instructions, once the changes have been accepted into the Linux kernel, please implement support into LLVM's BPF back end. See LLVM_ section below for further information. +Q: What "BPF_INTERNAL" symbol namespace is for? +----------------------------------------------- +A: Symbols exported as BPF_INTERNAL can only be used by BPF infrastructure +like preload kernel modules with light skeleton. Most symbols outside +of BPF_INTERNAL are not expected to be used by code outside of BPF either. +Symbols may lack the designation because they predate the namespaces, +or due to an oversight. + Stable submission ================= diff --git a/kernel/bpf/preload/bpf_preload_kern.c b/kernel/bpf/preload/bpf_preload_kern.c index 2fdf3c978db1..774e5a538811 100644 --- a/kernel/bpf/preload/bpf_preload_kern.c +++ b/kernel/bpf/preload/bpf_preload_kern.c @@ -89,5 +89,6 @@ static void __exit fini(void) } late_initcall(load); module_exit(fini); +MODULE_IMPORT_NS("BPF_INTERNAL"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Embedded BPF programs for introspection in bpffs"); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 9794446bc8c6..64c3393e8270 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -1583,7 +1583,7 @@ struct bpf_map *bpf_map_get(u32 ufd) return map; } -EXPORT_SYMBOL(bpf_map_get); +EXPORT_SYMBOL_NS(bpf_map_get, "BPF_INTERNAL"); struct bpf_map *bpf_map_get_with_uref(u32 ufd) { @@ -3364,7 +3364,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) bpf_link_inc(link); return link; } -EXPORT_SYMBOL(bpf_link_get_from_fd); +EXPORT_SYMBOL_NS(bpf_link_get_from_fd, "BPF_INTERNAL"); static void bpf_tracing_link_release(struct bpf_link *link) { @@ -6020,7 +6020,7 @@ int kern_sys_bpf(int cmd, union bpf_attr *attr, unsigned int size) return ____bpf_sys_bpf(cmd, attr, size); } } -EXPORT_SYMBOL(kern_sys_bpf); +EXPORT_SYMBOL_NS(kern_sys_bpf, "BPF_INTERNAL"); static const struct bpf_func_proto bpf_sys_bpf_proto = { .func = bpf_sys_bpf, -- cgit v1.2.3 From 548762f05d19c5542db7590bcdfb9be1fb928376 Mon Sep 17 00:00:00 2001 From: Haoran Jiang Date: Fri, 25 Apr 2025 17:50:42 +0800 Subject: samples/bpf: Fix compilation failure for samples/bpf on LoongArch Fedora When building the latest samples/bpf on LoongArch Fedora make M=samples/bpf There are compilation errors as follows: In file included from ./linux/samples/bpf/sockex2_kern.c:2: In file included from ./include/uapi/linux/in.h:25: In file included from ./include/linux/socket.h:8: In file included from ./include/linux/uio.h:9: In file included from ./include/linux/thread_info.h:60: In file included from ./arch/loongarch/include/asm/thread_info.h:15: In file included from ./arch/loongarch/include/asm/processor.h:13: In file included from ./arch/loongarch/include/asm/cpu-info.h:11: ./arch/loongarch/include/asm/loongarch.h:13:10: fatal error: 'larchintrin.h' file not 
found ^~~~~~~~~~~~~~~ 1 error generated. larchintrin.h is included in /usr/lib64/clang/14.0.6/include, and the header file location is specified at compile time. Test on LoongArch Fedora: https://github.com/fedora-remix-loongarch/releases-info Signed-off-by: Haoran Jiang Signed-off-by: zhangxi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250425095042.838824-1-jianghaoran@kylinos.cn --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 5b632635e00d..95a4fa1f1e44 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -376,7 +376,7 @@ $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ - -I$(LIBBPF_INCLUDE) \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ -- cgit v1.2.3 From f0007910784a61556e94c42b401a38116a899c73 Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Fri, 25 Apr 2025 21:37:10 +0000 Subject: selftests/bpf: Correct typo in __clang_major__ macro Make sure that CAN_USE_BPF_ST test (compute_live_registers/store) is enabled when __clang_major__ >= 18. Fixes: 2ea8f6a1cda7 ("selftests/bpf: test cases for compute_live_registers()") Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/20250425213712.1542077-1-yepeilin@google.com Signed-off-by: Alexei Starovoitov --- tools/testing/selftests/bpf/progs/bpf_misc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/bpf/progs/bpf_misc.h b/tools/testing/selftests/bpf/progs/bpf_misc.h index 13a2e22f5465..863df7c0fdd0 100644 --- a/tools/testing/selftests/bpf/progs/bpf_misc.h +++ b/tools/testing/selftests/bpf/progs/bpf_misc.h @@ -221,7 +221,7 @@ #define CAN_USE_GOTOL #endif -#if _clang_major__ >= 18 +#if __clang_major__ >= 18 #define CAN_USE_BPF_ST #endif -- cgit v1.2.3 From fb8e9f59d6f292c3d9fea6c155c22ea5fc3053ab Mon Sep 17 00:00:00 2001 From: Yuli Wang Date: Thu, 24 Apr 2025 20:15:22 +0800 Subject: LoongArch: Select ARCH_USE_MEMTEST As of commit dce44566192e ("mm/memtest: add ARCH_USE_MEMTEST"), architectures must select ARCH_USE_MEMTESET to enable CONFIG_MEMTEST. Commit 628c3bb40e9a ("LoongArch: Add boot and setup routines") added support for early_memtest but did not select ARCH_USE_MEMTESET. Fixes: 628c3bb40e9a ("LoongArch: Add boot and setup routines") Tested-by: Erpeng Xu Tested-by: Yuli Wang Signed-off-by: Yuli Wang Signed-off-by: Huacai Chen --- arch/loongarch/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/Kconfig b/arch/loongarch/Kconfig index 067c0b994648..1a2cf012b8f2 100644 --- a/arch/loongarch/Kconfig +++ b/arch/loongarch/Kconfig @@ -73,6 +73,7 @@ config LOONGARCH select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_MEMTEST select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_DEFAULT_BPF_JIT -- cgit v1.2.3 From bb0511d59db9b3e40c8d51f0d151ccd0fd44071d Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 24 Apr 2025 20:15:41 +0800 Subject: LoongArch: Make regs_irqs_disabled() more clear In the current code, the definition of regs_irqs_disabled() is actually "!(regs->csr_prmd & CSR_CRMD_IE)" because arch_irqs_disabled_flags() is defined as "!(flags & CSR_CRMD_IE)", it looks a little strange. 
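Spelled out, the current indirection looks like this (a sketch; definitions paraphrased from the LoongArch headers and the hunk below):

    /* asm/irqflags.h */
    static inline int arch_irqs_disabled_flags(unsigned long flags)
    {
            return !(flags & CSR_CRMD_IE);          /* a CRMD bit name ... */
    }

    /* asm/ptrace.h, before this patch */
    static inline int regs_irqs_disabled(struct pt_regs *regs)
    {
            return arch_irqs_disabled_flags(regs->csr_prmd);  /* ... tested on PRMD */
    }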
Define regs_irqs_disabled() as !(regs->csr_prmd & CSR_PRMD_PIE) directly to make it more clear, no functional change. While at it, the return value of regs_irqs_disabled() is true or false, so change its type to reflect that and also make it always inline. Fixes: 803b0fc5c3f2 ("LoongArch: Add process management") Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/ptrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index f3ddaed9ef7f..a5b63c84f854 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -33,9 +33,9 @@ struct pt_regs { unsigned long __last[]; } __aligned(8); -static inline int regs_irqs_disabled(struct pt_regs *regs) +static __always_inline bool regs_irqs_disabled(struct pt_regs *regs) { - return arch_irqs_disabled_flags(regs->csr_prmd); + return !(regs->csr_prmd & CSR_PRMD_PIE); } static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) -- cgit v1.2.3 From cc73cc6bcdb5f959670e3ff9abdc62461452ddff Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 24 Apr 2025 20:15:41 +0800 Subject: LoongArch: Make do_xyz() exception handlers more robust Currently, interrupts need to be disabled before single-step mode is set, it requires that CSR_PRMD_PIE be cleared in save_local_irqflag() which is called by setup_singlestep(), this is reasonable. But in the first kprobe breakpoint exception, if the irq is enabled at the beginning of do_bp(), it will not be disabled at the end of do_bp() due to the CSR_PRMD_PIE has been cleared in save_local_irqflag(). So for this case, it may corrupt exception context when restoring the exception after do_bp() in handle_bp(), this is not reasonable. In order to restore exception safely in handle_bp(), it needs to ensure the irq is disabled at the end of do_bp(), so just add a local variable to record the original interrupt status in the parent context, then use it as the check condition to enable and disable irq in do_bp(). While at it, do the similar thing for other do_xyz() exception handlers to make them more robust. 
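The resulting shape of every such handler, sketched with do_bp() (pie records the parent context's interrupt state; the full hunks follow):

    asmlinkage void noinstr do_bp(struct pt_regs *regs)
    {
            bool pie = regs_irqs_disabled(regs);    /* parent context's irq state */
            irqentry_state_t state = irqentry_enter(regs);

            if (!pie)
                    local_irq_enable();

            /* handle the breakpoint; a kprobe may clear CSR_PRMD_PIE in here */

            if (!pie)
                    local_irq_disable();            /* always mirrors the parent state */

            irqentry_exit(regs, state);
    }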
Fixes: 6d4cc40fb5f5 ("LoongArch: Add kprobes support") Suggested-by: Jinyang He Suggested-by: Huacai Chen Co-developed-by: Tianyang Zhang Signed-off-by: Tianyang Zhang Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/kernel/traps.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/loongarch/kernel/traps.c b/arch/loongarch/kernel/traps.c index 2ec3106c0da3..47fc2de6d150 100644 --- a/arch/loongarch/kernel/traps.c +++ b/arch/loongarch/kernel/traps.c @@ -553,9 +553,10 @@ asmlinkage void noinstr do_ale(struct pt_regs *regs) die_if_kernel("Kernel ale access", regs); force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr); #else + bool pie = regs_irqs_disabled(regs); unsigned int *pc; - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); perf_sw_event(PERF_COUNT_SW_ALIGNMENT_FAULTS, 1, regs, regs->csr_badvaddr); @@ -582,7 +583,7 @@ sigbus: die_if_kernel("Kernel ale access", regs); force_sig_fault(SIGBUS, BUS_ADRALN, (void __user *)regs->csr_badvaddr); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); #endif irqentry_exit(regs, state); @@ -621,12 +622,13 @@ static void bug_handler(struct pt_regs *regs) asmlinkage void noinstr do_bce(struct pt_regs *regs) { bool user = user_mode(regs); + bool pie = regs_irqs_disabled(regs); unsigned long era = exception_era(regs); u64 badv = 0, lower = 0, upper = ULONG_MAX; union loongarch_instruction insn; irqentry_state_t state = irqentry_enter(regs); - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); current->thread.trap_nr = read_csr_excode(); @@ -692,7 +694,7 @@ asmlinkage void noinstr do_bce(struct pt_regs *regs) force_sig_bnderr((void __user *)badv, (void __user *)lower, (void __user *)upper); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); @@ -710,11 +712,12 @@ bad_era: asmlinkage void noinstr do_bp(struct pt_regs *regs) { bool user = user_mode(regs); + bool pie = regs_irqs_disabled(regs); unsigned int opcode, bcode; unsigned long era = exception_era(regs); irqentry_state_t state = irqentry_enter(regs); - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); if (__get_inst(&opcode, (u32 *)era, user)) @@ -780,7 +783,7 @@ asmlinkage void noinstr do_bp(struct pt_regs *regs) } out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); @@ -1015,6 +1018,7 @@ static void init_restore_lbt(void) asmlinkage void noinstr do_lbt(struct pt_regs *regs) { + bool pie = regs_irqs_disabled(regs); irqentry_state_t state = irqentry_enter(regs); /* @@ -1024,7 +1028,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs) * (including the user using 'MOVGR2GCSR' to turn on TM, which * will not trigger the BTE), we need to check PRMD first. */ - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_enable(); if (!cpu_has_lbt) { @@ -1038,7 +1042,7 @@ asmlinkage void noinstr do_lbt(struct pt_regs *regs) preempt_enable(); out: - if (regs->csr_prmd & CSR_PRMD_PIE) + if (!pie) local_irq_disable(); irqentry_exit(regs, state); -- cgit v1.2.3 From 2ef174b13344b3b4554d3d28e6f9e2a2c1d3138f Mon Sep 17 00:00:00 2001 From: Tiezhu Yang Date: Thu, 24 Apr 2025 20:15:41 +0800 Subject: LoongArch: Handle fp, lsx, lasx and lbt assembly symbols Like the other relevant symbols, export some fp, lsx, lasx and lbt assembly symbols and put the function declarations in header files rather than source files. 
While at it, use "asmlinkage" for the other existing C prototypes of assembly functions and also do not use the "extern" keyword with function declarations according to the document coding-style.rst. Cc: stable@vger.kernel.org # 6.6+ Signed-off-by: Tiezhu Yang Signed-off-by: Huacai Chen --- arch/loongarch/include/asm/fpu.h | 39 +++++++++++++++++++++++---------------- arch/loongarch/include/asm/lbt.h | 10 +++++++--- arch/loongarch/kernel/fpu.S | 6 ++++++ arch/loongarch/kernel/lbt.S | 4 ++++ arch/loongarch/kernel/signal.c | 21 --------------------- 5 files changed, 40 insertions(+), 40 deletions(-) diff --git a/arch/loongarch/include/asm/fpu.h b/arch/loongarch/include/asm/fpu.h index 3177674228f8..45514f314664 100644 --- a/arch/loongarch/include/asm/fpu.h +++ b/arch/loongarch/include/asm/fpu.h @@ -22,22 +22,29 @@ struct sigcontext; #define kernel_fpu_available() cpu_has_fpu -extern void kernel_fpu_begin(void); -extern void kernel_fpu_end(void); - -extern void _init_fpu(unsigned int); -extern void _save_fp(struct loongarch_fpu *); -extern void _restore_fp(struct loongarch_fpu *); - -extern void _save_lsx(struct loongarch_fpu *fpu); -extern void _restore_lsx(struct loongarch_fpu *fpu); -extern void _init_lsx_upper(void); -extern void _restore_lsx_upper(struct loongarch_fpu *fpu); - -extern void _save_lasx(struct loongarch_fpu *fpu); -extern void _restore_lasx(struct loongarch_fpu *fpu); -extern void _init_lasx_upper(void); -extern void _restore_lasx_upper(struct loongarch_fpu *fpu); + +void kernel_fpu_begin(void); +void kernel_fpu_end(void); + +asmlinkage void _init_fpu(unsigned int); +asmlinkage void _save_fp(struct loongarch_fpu *); +asmlinkage void _restore_fp(struct loongarch_fpu *); +asmlinkage int _save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); +asmlinkage int _restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); + +asmlinkage void _save_lsx(struct loongarch_fpu *fpu); +asmlinkage void _restore_lsx(struct loongarch_fpu *fpu); +asmlinkage void _init_lsx_upper(void); +asmlinkage void _restore_lsx_upper(struct loongarch_fpu *fpu); +asmlinkage int _save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); +asmlinkage int _restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); + +asmlinkage void _save_lasx(struct loongarch_fpu *fpu); +asmlinkage void _restore_lasx(struct loongarch_fpu *fpu); +asmlinkage void _init_lasx_upper(void); +asmlinkage void _restore_lasx_upper(struct loongarch_fpu *fpu); +asmlinkage int _save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); +asmlinkage int _restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); static inline void enable_lsx(void); static inline void disable_lsx(void); diff --git a/arch/loongarch/include/asm/lbt.h b/arch/loongarch/include/asm/lbt.h index e671978bf552..38566574e562 100644 --- a/arch/loongarch/include/asm/lbt.h +++ b/arch/loongarch/include/asm/lbt.h @@ -12,9 +12,13 @@ #include #include -extern void _init_lbt(void); -extern void _save_lbt(struct loongarch_lbt *); -extern void _restore_lbt(struct loongarch_lbt *); +asmlinkage void _init_lbt(void); +asmlinkage void _save_lbt(struct loongarch_lbt *); +asmlinkage void _restore_lbt(struct loongarch_lbt *); +asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags); +asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags); +asmlinkage int _save_ftop_context(void __user *ftop); +asmlinkage int _restore_ftop_context(void 
__user *ftop); static inline int is_lbt_enabled(void) { diff --git a/arch/loongarch/kernel/fpu.S b/arch/loongarch/kernel/fpu.S index 6ab640101457..28caf416ae36 100644 --- a/arch/loongarch/kernel/fpu.S +++ b/arch/loongarch/kernel/fpu.S @@ -458,6 +458,7 @@ SYM_FUNC_START(_save_fp_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_fp_context) +EXPORT_SYMBOL_GPL(_save_fp_context) /* * a0: fpregs @@ -471,6 +472,7 @@ SYM_FUNC_START(_restore_fp_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_fp_context) +EXPORT_SYMBOL_GPL(_restore_fp_context) /* * a0: fpregs @@ -484,6 +486,7 @@ SYM_FUNC_START(_save_lsx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lsx_context) +EXPORT_SYMBOL_GPL(_save_lsx_context) /* * a0: fpregs @@ -497,6 +500,7 @@ SYM_FUNC_START(_restore_lsx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lsx_context) +EXPORT_SYMBOL_GPL(_restore_lsx_context) /* * a0: fpregs @@ -510,6 +514,7 @@ SYM_FUNC_START(_save_lasx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lasx_context) +EXPORT_SYMBOL_GPL(_save_lasx_context) /* * a0: fpregs @@ -523,6 +528,7 @@ SYM_FUNC_START(_restore_lasx_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lasx_context) +EXPORT_SYMBOL_GPL(_restore_lasx_context) .L_fpu_fault: li.w a0, -EFAULT # failure diff --git a/arch/loongarch/kernel/lbt.S b/arch/loongarch/kernel/lbt.S index 001f061d226a..71678912d24c 100644 --- a/arch/loongarch/kernel/lbt.S +++ b/arch/loongarch/kernel/lbt.S @@ -90,6 +90,7 @@ SYM_FUNC_START(_save_lbt_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_lbt_context) +EXPORT_SYMBOL_GPL(_save_lbt_context) /* * a0: scr @@ -110,6 +111,7 @@ SYM_FUNC_START(_restore_lbt_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_lbt_context) +EXPORT_SYMBOL_GPL(_restore_lbt_context) /* * a0: ftop @@ -120,6 +122,7 @@ SYM_FUNC_START(_save_ftop_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_save_ftop_context) +EXPORT_SYMBOL_GPL(_save_ftop_context) /* * a0: ftop @@ -150,6 +153,7 @@ SYM_FUNC_START(_restore_ftop_context) li.w a0, 0 # success jr ra SYM_FUNC_END(_restore_ftop_context) +EXPORT_SYMBOL_GPL(_restore_ftop_context) .L_lbt_fault: li.w a0, -EFAULT # failure diff --git a/arch/loongarch/kernel/signal.c b/arch/loongarch/kernel/signal.c index 7a555b600171..4740cb5b2388 100644 --- a/arch/loongarch/kernel/signal.c +++ b/arch/loongarch/kernel/signal.c @@ -51,27 +51,6 @@ #define lock_lbt_owner() ({ preempt_disable(); pagefault_disable(); }) #define unlock_lbt_owner() ({ pagefault_enable(); preempt_enable(); }) -/* Assembly functions to move context to/from the FPU */ -extern asmlinkage int -_save_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); -extern asmlinkage int -_restore_fp_context(void __user *fpregs, void __user *fcc, void __user *csr); -extern asmlinkage int -_save_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_restore_lsx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_save_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); -extern asmlinkage int -_restore_lasx_context(void __user *fpregs, void __user *fcc, void __user *fcsr); - -#ifdef CONFIG_CPU_HAS_LBT -extern asmlinkage int _save_lbt_context(void __user *regs, void __user *eflags); -extern asmlinkage int _restore_lbt_context(void __user *regs, void __user *eflags); -extern asmlinkage int _save_ftop_context(void __user *ftop); -extern asmlinkage int _restore_ftop_context(void __user *ftop); -#endif - struct rt_sigframe { 
struct siginfo rs_info; struct ucontext rs_uctx; -- cgit v1.2.3 From c37325cbd91abe3bfab280b3b09947155abe8e07 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Thu, 24 Apr 2025 20:15:41 +0800 Subject: LoongArch: Remove a bogus reference to ZONE_DMA Remove dead code. LoongArch does not have a DMA memory zone (24bit DMA). The architecture does not even define MAX_DMA_PFN. Cc: stable@vger.kernel.org Reviewed-by: Mike Rapoport (Microsoft) Signed-off-by: Petr Tesarik Signed-off-by: Huacai Chen --- arch/loongarch/mm/init.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/loongarch/mm/init.c b/arch/loongarch/mm/init.c index fdb7f73ad160..06f11d9e4ec1 100644 --- a/arch/loongarch/mm/init.c +++ b/arch/loongarch/mm/init.c @@ -65,9 +65,6 @@ void __init paging_init(void) { unsigned long max_zone_pfns[MAX_NR_ZONES]; -#ifdef CONFIG_ZONE_DMA - max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; -#endif #ifdef CONFIG_ZONE_DMA32 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; #endif -- cgit v1.2.3 From bd51834d1cf65a2c801295d230c220aeebf87a73 Mon Sep 17 00:00:00 2001 From: Ming Wang Date: Thu, 24 Apr 2025 20:15:47 +0800 Subject: LoongArch: Return NULL from huge_pte_offset() for invalid PMD LoongArch's huge_pte_offset() currently returns a pointer to a PMD slot even if the underlying entry points to invalid_pte_table (indicating no mapping). Callers like smaps_hugetlb_range() fetch this invalid entry value (the address of invalid_pte_table) via this pointer. The generic is_swap_pte() check then incorrectly identifies this address as a swap entry on LoongArch, because it satisfies the "!pte_present() && !pte_none()" conditions. This misinterpretation, combined with a coincidental match by is_migration_entry() on the address bits, leads to kernel crashes in pfn_swap_entry_to_page(). Fix this at the architecture level by modifying huge_pte_offset() to check the PMD entry's content using pmd_none() before returning. If the entry is invalid (i.e., it points to invalid_pte_table), return NULL instead of the pointer to the slot. Cc: stable@vger.kernel.org Acked-by: Peter Xu Co-developed-by: Hongchen Zhang Signed-off-by: Hongchen Zhang Signed-off-by: Ming Wang Signed-off-by: Huacai Chen --- arch/loongarch/mm/hugetlbpage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/mm/hugetlbpage.c b/arch/loongarch/mm/hugetlbpage.c index e4068906143b..cea84d7f2b91 100644 --- a/arch/loongarch/mm/hugetlbpage.c +++ b/arch/loongarch/mm/hugetlbpage.c @@ -47,7 +47,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, pmd = pmd_offset(pud, addr); } } - return (pte_t *) pmd; + return pmd_none(pmdp_get(pmd)) ? NULL : (pte_t *) pmd; } uint64_t pmd_to_entrylo(unsigned long pmd_val) -- cgit v1.2.3 From 8b2d01fec800081dd68271c01e4d239ef4d7115e Mon Sep 17 00:00:00 2001 From: Yulong Han Date: Thu, 24 Apr 2025 20:15:52 +0800 Subject: LoongArch: KVM: Fix multiple typos of KVM code Fix multiple typos inside arch/loongarch/kvm. 
Cc: stable@vger.kernel.org Reviewed-by: Yuli Wang Reviewed-by: Bibo Mao Signed-off-by: Yulong Han Signed-off-by: Huacai Chen --- arch/loongarch/kvm/intc/ipi.c | 4 ++-- arch/loongarch/kvm/main.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/loongarch/kvm/intc/ipi.c b/arch/loongarch/kvm/intc/ipi.c index 93f4acd44523..fe734dc062ed 100644 --- a/arch/loongarch/kvm/intc/ipi.c +++ b/arch/loongarch/kvm/intc/ipi.c @@ -111,7 +111,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) ret = kvm_io_bus_read(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) { - kvm_err("%s: : read date from addr %llx failed\n", __func__, addr); + kvm_err("%s: : read data from addr %llx failed\n", __func__, addr); return ret; } /* Construct the mask by scanning the bit 27-30 */ @@ -127,7 +127,7 @@ static int send_ipi_data(struct kvm_vcpu *vcpu, gpa_t addr, uint64_t data) ret = kvm_io_bus_write(vcpu, KVM_IOCSR_BUS, addr, sizeof(val), &val); srcu_read_unlock(&vcpu->kvm->srcu, idx); if (unlikely(ret)) - kvm_err("%s: : write date to addr %llx failed\n", __func__, addr); + kvm_err("%s: : write data to addr %llx failed\n", __func__, addr); return ret; } diff --git a/arch/loongarch/kvm/main.c b/arch/loongarch/kvm/main.c index d165cd38c6bb..80ea63d465b8 100644 --- a/arch/loongarch/kvm/main.c +++ b/arch/loongarch/kvm/main.c @@ -296,10 +296,10 @@ int kvm_arch_enable_virtualization_cpu(void) /* * Enable virtualization features granting guest direct control of * certain features: - * GCI=2: Trap on init or unimplement cache instruction. + * GCI=2: Trap on init or unimplemented cache instruction. * TORU=0: Trap on Root Unimplement. * CACTRL=1: Root control cache. - * TOP=0: Trap on Previlege. + * TOP=0: Trap on Privilege. * TOE=0: Trap on Exception. * TIT=0: Trap on Timer. */ -- cgit v1.2.3 From 9ea86232a5520d9d21832d06031ea80f055a6ff8 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 24 Apr 2025 20:15:52 +0800 Subject: LoongArch: KVM: Fully clear some CSRs when VM reboot Some registers such as LOONGARCH_CSR_ESTAT and LOONGARCH_CSR_GINTC are partly cleared with function _kvm_setcsr(). This comes from the hardware specification, some bits are read only in VM mode, and however they can be written in host mode. So they are partly cleared in VM mode, and can be fully cleared in host mode. These read only bits show pending interrupt or exception status. When VM reset, the read-only bits should be cleared, otherwise vCPU will receive unknown interrupts in boot stage. Here registers LOONGARCH_CSR_ESTAT/LOONGARCH_CSR_GINTC are fully cleared in ioctl KVM_REG_LOONGARCH_VCPU_RESET vCPU reset path. Cc: stable@vger.kernel.org Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 8e427b379661..2d3c2a2d1d1c 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -902,6 +902,13 @@ static int kvm_set_one_reg(struct kvm_vcpu *vcpu, vcpu->arch.st.guest_addr = 0; memset(&vcpu->arch.irq_pending, 0, sizeof(vcpu->arch.irq_pending)); memset(&vcpu->arch.irq_clear, 0, sizeof(vcpu->arch.irq_clear)); + + /* + * When vCPU reset, clear the ESTAT and GINTC registers + * Other CSR registers are cleared with function _kvm_setcsr(). 
+ */ + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_GINTC, 0); + kvm_write_sw_gcsr(vcpu->arch.csr, LOONGARCH_CSR_ESTAT, 0); break; default: ret = -EINVAL; -- cgit v1.2.3 From 5add0dbbebd60628b55e5eb8426612dedab7311a Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Thu, 24 Apr 2025 20:15:52 +0800 Subject: LoongArch: KVM: Fix PMU pass-through issue if VM exits to host finally In function kvm_pre_enter_guest(), it prepares to enter guest and check whether there are pending signals or events. And it will not enter guest if there are, PMU pass-through preparation for guest should be cancelled and host should own PMU hardware. Cc: stable@vger.kernel.org Fixes: f4e40ea9f78f ("LoongArch: KVM: Add PMU support for guest") Signed-off-by: Bibo Mao Signed-off-by: Huacai Chen --- arch/loongarch/kvm/vcpu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/loongarch/kvm/vcpu.c b/arch/loongarch/kvm/vcpu.c index 2d3c2a2d1d1c..5af32ec62cb1 100644 --- a/arch/loongarch/kvm/vcpu.c +++ b/arch/loongarch/kvm/vcpu.c @@ -294,6 +294,7 @@ static int kvm_pre_enter_guest(struct kvm_vcpu *vcpu) vcpu->arch.aux_inuse &= ~KVM_LARCH_SWCSR_LATEST; if (kvm_request_pending(vcpu) || xfer_to_guest_mode_work_pending()) { + kvm_lose_pmu(vcpu); /* make sure the vcpu mode has been written */ smp_store_mb(vcpu->mode, OUTSIDE_GUEST_MODE); local_irq_enable(); -- cgit v1.2.3 From 3318dc299b072a0511d6dfd8367f3304fb6d9827 Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 22 Apr 2025 17:16:16 +0100 Subject: irqchip/gic-v2m: Prevent use after free of gicv2m_get_fwnode() With ACPI in place, gicv2m_get_fwnode() is registered with the pci subsystem as pci_msi_get_fwnode_cb(), which may get invoked at runtime during a PCI host bridge probe. But, the call back is wrongly marked as __init, causing it to be freed, while being registered with the PCI subsystem and could trigger: Unable to handle kernel paging request at virtual address ffff8000816c0400 gicv2m_get_fwnode+0x0/0x58 (P) pci_set_bus_msi_domain+0x74/0x88 pci_register_host_bridge+0x194/0x548 This is easily reproducible on a Juno board with ACPI boot. Retain the function for later use. Fixes: 0644b3daca28 ("irqchip/gic-v2m: acpi: Introducing GICv2m ACPI support") Signed-off-by: Suzuki K Poulose Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar Reviewed-by: Marc Zyngier Cc: stable@vger.kernel.org --- drivers/irqchip/irq-gic-v2m.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-gic-v2m.c b/drivers/irqchip/irq-gic-v2m.c index c69894861866..dc98c39d2b20 100644 --- a/drivers/irqchip/irq-gic-v2m.c +++ b/drivers/irqchip/irq-gic-v2m.c @@ -421,7 +421,7 @@ static int __init gicv2m_of_init(struct fwnode_handle *parent_handle, #ifdef CONFIG_ACPI static int acpi_num_msi; -static __init struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) +static struct fwnode_handle *gicv2m_get_fwnode(struct device *dev) { struct v2m_data *data; -- cgit v1.2.3 From bbce3de72be56e4b5f68924b7da9630cc89aa1a8 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 25 Apr 2025 01:51:24 -0700 Subject: sched/eevdf: Fix se->slice being set to U64_MAX and resulting crash There is a code path in dequeue_entities() that can set the slice of a sched_entity to U64_MAX, which sometimes results in a crash. The offending case is when dequeue_entities() is called to dequeue a delayed group entity, and then the entity's parent's dequeue is delayed. In that case: 1. 
In the if (entity_is_task(se)) else block at the beginning of dequeue_entities(), slice is set to cfs_rq_min_slice(group_cfs_rq(se)). If the entity was delayed, then it has no queued tasks, so cfs_rq_min_slice() returns U64_MAX. 2. The first for_each_sched_entity() loop dequeues the entity. 3. If the entity was its parent's only child, then the next iteration tries to dequeue the parent. 4. If the parent's dequeue needs to be delayed, then it breaks from the first for_each_sched_entity() loop _without updating slice_. 5. The second for_each_sched_entity() loop sets the parent's ->slice to the saved slice, which is still U64_MAX. This throws off subsequent calculations with potentially catastrophic results. A manifestation we saw in production was: 6. In update_entity_lag(), se->slice is used to calculate limit, which ends up as a huge negative number. 7. limit is used in se->vlag = clamp(vlag, -limit, limit). Because limit is negative, vlag > limit, so se->vlag is set to the same huge negative number. 8. In place_entity(), se->vlag is scaled, which overflows and results in another huge (positive or negative) number. 9. The adjusted lag is subtracted from se->vruntime, which increases or decreases se->vruntime by a huge number. 10. pick_eevdf() calls entity_eligible()/vruntime_eligible(), which incorrectly returns false because the vruntime is so far from the other vruntimes on the queue, causing the (vruntime - cfs_rq->min_vruntime) * load calulation to overflow. 11. Nothing appears to be eligible, so pick_eevdf() returns NULL. 12. pick_next_entity() tries to dereference the return value of pick_eevdf() and crashes. Dumping the cfs_rq states from the core dumps with drgn showed tell-tale huge vruntime ranges and bogus vlag values, and I also traced se->slice being set to U64_MAX on live systems (which was usually "benign" since the rest of the runqueue needed to be in a particular state to crash). Fix it in dequeue_entities() by always setting slice from the first non-empty cfs_rq. Fixes: aef6987d8954 ("sched/eevdf: Propagate min_slice up the cgroup hierarchy") Signed-off-by: Omar Sandoval Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar Link: https://lkml.kernel.org/r/f0c2d1072be229e1bdddc73c0703919a8b00c652.1745570998.git.osandov@fb.com --- kernel/sched/fair.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e43993a4e580..0fb9bf995a47 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7081,9 +7081,6 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) h_nr_idle = task_has_idle_policy(p); if (task_sleep || task_delayed || !se->sched_delayed) h_nr_runnable = 1; - } else { - cfs_rq = group_cfs_rq(se); - slice = cfs_rq_min_slice(cfs_rq); } for_each_sched_entity(se) { @@ -7093,6 +7090,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) if (p && &p->se == se) return -1; + slice = cfs_rq_min_slice(cfs_rq); break; } -- cgit v1.2.3 From 831e3f545b0771f91fa94cdb8aa569a73b9ec580 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 24 Apr 2025 09:27:35 -0400 Subject: Revert "sunrpc: clean cache_detail immediately when flush is written frequently" Ondrej reports that certain SELinux tests are failing after commit fc2a169c56de ("sunrpc: clean cache_detail immediately when flush is written frequently"), merged during the v6.15 merge window. 
Reported-by: Ondrej Mosnacek Fixes: fc2a169c56de ("sunrpc: clean cache_detail immediately when flush is written frequently") Signed-off-by: Chuck Lever --- net/sunrpc/cache.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 004cdb59f010..7ce5e28a6c03 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -1536,13 +1536,9 @@ static ssize_t write_flush(struct file *file, const char __user *buf, * or by one second if it has already reached the current time. * Newly added cache entries will always have ->last_refresh greater * that ->flush_time, so they don't get flushed prematurely. - * - * If someone frequently calls the flush interface, we should - * immediately clean the corresponding cache_detail instead of - * continuously accumulating nextcheck. */ - if (cd->flush_time >= now && cd->flush_time < (now + 5)) + if (cd->flush_time >= now) now = cd->flush_time + 1; cd->flush_time = now; -- cgit v1.2.3 From b4432656b36e5cc1d50a1f2dc15357543add530e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 27 Apr 2025 15:19:23 -0700 Subject: Linux 6.15-rc4 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 07f818186151..5aa9ee52a765 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 15 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Baby Opossum Posse # *DOCUMENTATION* -- cgit v1.2.3 From 8ed12ab1319b2d8e4a529504777aacacf71371e4 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Mon, 28 Apr 2025 19:43:22 +0200 Subject: x86/boot/sev: Support memory acceptance in the EFI stub under SVSM Commit: d54d610243a4 ("x86/boot/sev: Avoid shared GHCB page for early memory acceptance") provided a fix for SEV-SNP memory acceptance from the EFI stub when running at VMPL #0. However, that fix was insufficient for SVSM SEV-SNP guests running at VMPL >0, as those rely on a SVSM calling area, which is a shared buffer whose address is programmed into a SEV-SNP MSR, and the SEV init code that sets up this calling area executes much later during the boot. Given that booting via the EFI stub at VMPL >0 implies that the firmware has configured this calling area already, reuse it for performing memory acceptance in the EFI stub. 
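Concretely, the stub only has to check the SVSM CPUID bit and read back the calling-area address the firmware has already programmed; a sketch of the discovery step (names as in the compressed-boot code, full hunk below):

    unsigned int eax = 0x8000001f, ebx, ecx = 0, edx;

    native_cpuid(&eax, &ebx, &ecx, &edx);
    if (eax & BIT(28)) {                    /* Fn8000_001F_EAX[28]: SVSM support */
            struct msr m;

            boot_rdmsr(MSR_SVSM_CAA, &m);   /* calling area set up by the firmware */
            boot_svsm_caa    = (void *)m.q;
            boot_svsm_caa_pa = m.q;
            snp_vmpl = U8_MAX;              /* any non-zero VMPL marker suffices */
    }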
Fixes: fcd042e86422 ("x86/sev: Perform PVALIDATE using the SVSM when not at VMPL0") Tested-by: Tom Lendacky Co-developed-by: Tom Lendacky Signed-off-by: Tom Lendacky Signed-off-by: Ard Biesheuvel Signed-off-by: Ingo Molnar Cc: Cc: Dionna Amalie Glaze Cc: Kevin Loughlin Cc: linux-efi@vger.kernel.org Link: https://lore.kernel.org/r/20250428174322.2780170-2-ardb+git@google.com --- arch/x86/boot/compressed/mem.c | 5 +---- arch/x86/boot/compressed/sev.c | 40 ++++++++++++++++++++++++++++++++++++++++ arch/x86/boot/compressed/sev.h | 2 ++ 3 files changed, 43 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/mem.c b/arch/x86/boot/compressed/mem.c index f676156d9f3d..0e9f84ab4bdc 100644 --- a/arch/x86/boot/compressed/mem.c +++ b/arch/x86/boot/compressed/mem.c @@ -34,14 +34,11 @@ static bool early_is_tdx_guest(void) void arch_accept_memory(phys_addr_t start, phys_addr_t end) { - static bool sevsnp; - /* Platform-specific memory-acceptance call goes here */ if (early_is_tdx_guest()) { if (!tdx_accept_memory(start, end)) panic("TDX: Failed to accept memory\n"); - } else if (sevsnp || (sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) { - sevsnp = true; + } else if (early_is_sevsnp_guest()) { snp_accept_memory(start, end); } else { error("Cannot accept memory: unknown platform\n"); diff --git a/arch/x86/boot/compressed/sev.c b/arch/x86/boot/compressed/sev.c index 89ba168f4f0f..0003e4416efd 100644 --- a/arch/x86/boot/compressed/sev.c +++ b/arch/x86/boot/compressed/sev.c @@ -645,3 +645,43 @@ void sev_prep_identity_maps(unsigned long top_level_pgt) sev_verify_cbit(top_level_pgt); } + +bool early_is_sevsnp_guest(void) +{ + static bool sevsnp; + + if (sevsnp) + return true; + + if (!(sev_get_status() & MSR_AMD64_SEV_SNP_ENABLED)) + return false; + + sevsnp = true; + + if (!snp_vmpl) { + unsigned int eax, ebx, ecx, edx; + + /* + * CPUID Fn8000_001F_EAX[28] - SVSM support + */ + eax = 0x8000001f; + ecx = 0; + native_cpuid(&eax, &ebx, &ecx, &edx); + if (eax & BIT(28)) { + struct msr m; + + /* Obtain the address of the calling area to use */ + boot_rdmsr(MSR_SVSM_CAA, &m); + boot_svsm_caa = (void *)m.q; + boot_svsm_caa_pa = m.q; + + /* + * The real VMPL level cannot be discovered, but the + * memory acceptance routines make no use of that so + * any non-zero value suffices here. + */ + snp_vmpl = U8_MAX; + } + } + return true; +} diff --git a/arch/x86/boot/compressed/sev.h b/arch/x86/boot/compressed/sev.h index 4e463f33186d..d3900384b8ab 100644 --- a/arch/x86/boot/compressed/sev.h +++ b/arch/x86/boot/compressed/sev.h @@ -13,12 +13,14 @@ bool sev_snp_enabled(void); void snp_accept_memory(phys_addr_t start, phys_addr_t end); u64 sev_get_status(void); +bool early_is_sevsnp_guest(void); #else static inline bool sev_snp_enabled(void) { return false; } static inline void snp_accept_memory(phys_addr_t start, phys_addr_t end) { } static inline u64 sev_get_status(void) { return 0; } +static inline bool early_is_sevsnp_guest(void) { return false; } #endif -- cgit v1.2.3