From f3ed0a17f0292300b3caca32d823ecd32554a667 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Thu, 11 Jul 2013 09:50:30 -0700 Subject: Thermal: x86 package temp thermal crash On systems with no package MSR support this caused crash as there is a bug in the logic to check presence of DTHERM and PTS feature together. Added a change so that when there is no PTS support, module doesn't get loaded. Even if some CPU comes online with the PTS feature disabled, and other CPUs has this support, this patch will still prevent such MSR accesses. Signed-off-by: Srinivas Pandruvada Reported-by: Daniel Walker Signed-off-by: Zhang Rui --- drivers/thermal/x86_pkg_temp_thermal.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 5de56f671a9d..034604c021d6 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -511,7 +511,7 @@ static int get_core_online(unsigned int cpu) /* Check if there is already an instance for this package */ if (!phdev) { - if (!cpu_has(c, X86_FEATURE_DTHERM) && + if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS)) return -ENODEV; if (pkg_temp_thermal_device_add(cpu)) @@ -562,7 +562,7 @@ static struct notifier_block pkg_temp_thermal_notifier __refdata = { }; static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = { - { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_DTHERM }, + { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS }, {} }; MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); -- cgit v1.2.3 From c7c1b3112e643d290471abd0f3b27ffeec6f28dd Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 18 Jun 2013 21:09:12 +0800 Subject: Thermal: x86_pkg_temp: fix krealloc() misuse in in pkg_temp_thermal_device_add() If krealloc() returns NULL, it doesn't free the original. So any code of the form 'foo = krealloc(foo, ...);' is almost certainly a bug. Signed-off-by: Wei Yongjun Signed-off-by: Zhang Rui --- drivers/thermal/x86_pkg_temp_thermal.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 034604c021d6..cf172d50b5d3 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -394,6 +394,7 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) char buffer[30]; int thres_count; u32 eax, ebx, ecx, edx; + u8 *temp; cpuid(6, &eax, &ebx, &ecx, &edx); thres_count = ebx & 0x07; @@ -417,13 +418,14 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) spin_lock(&pkg_work_lock); if (topology_physical_package_id(cpu) > max_phy_id) max_phy_id = topology_physical_package_id(cpu); - pkg_work_scheduled = krealloc(pkg_work_scheduled, - (max_phy_id+1) * sizeof(u8), GFP_ATOMIC); - if (!pkg_work_scheduled) { + temp = krealloc(pkg_work_scheduled, + (max_phy_id+1) * sizeof(u8), GFP_ATOMIC); + if (!temp) { spin_unlock(&pkg_work_lock); err = -ENOMEM; goto err_ret_free; } + pkg_work_scheduled = temp; pkg_work_scheduled[topology_physical_package_id(cpu)] = 0; spin_unlock(&pkg_work_lock); -- cgit v1.2.3 From 94e791f522595e2d5276e6d27a5b3b57f4e1cd8d Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 15 Jul 2013 15:38:52 -0700 Subject: Thermal: x86_pkg_temp: Limit number of pkg temp zones Although it is unlikley that physical package id is set to some arbitary number, it is better to prevent in anycase. Since package temp zones use this in thermal zone type and for allocation, added a limit. Signed-off-by: Srinivas Pandruvada Signed-off-by: Zhang Rui --- drivers/thermal/x86_pkg_temp_thermal.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index cf172d50b5d3..810143a6aa33 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -54,6 +54,8 @@ MODULE_PARM_DESC(notify_delay_ms, * is some wrong values returned by cpuid for number of thresholds. */ #define MAX_NUMBER_OF_TRIPS 2 +/* Limit number of package temp zones */ +#define MAX_PKG_TEMP_ZONE_IDS 256 struct phy_dev_entry { struct list_head list; @@ -401,6 +403,9 @@ static int pkg_temp_thermal_device_add(unsigned int cpu) if (!thres_count) return -ENODEV; + if (topology_physical_package_id(cpu) > MAX_PKG_TEMP_ZONE_IDS) + return -ENODEV; + thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS); err = get_tj_max(cpu, &tj_max); -- cgit v1.2.3 From ace120dcf23b3bbba00d797a898481997381052f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 16 Jul 2013 14:02:28 -0400 Subject: Thermal: Fix lockup of cpu_down() Commit f1a18a105 "Thermal: CPU Package temperature thermal" had code that did a get_online_cpus(), run a loop and then do a put_online_cpus(). The problem is that the loop had an error exit that would skip the put_online_cpus() part. In the error exit part of the function, it also did a get_online_cpus(), run a loop and then put_online_cpus(). The only way to get to the error exit part is with get_online_cpus() already performed. If this error condition is hit, the system will be prevented from taking CPUs offline. The process taking the CPU offline will lock up hard. Removing the get_online_cpus() removes the lockup as the hotplug CPU refcount is back to zero. This was bisected with ktest. Signed-off-by: Steven Rostedt Signed-off-by: Zhang Rui --- drivers/thermal/x86_pkg_temp_thermal.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c index 810143a6aa33..f36950e4134f 100644 --- a/drivers/thermal/x86_pkg_temp_thermal.c +++ b/drivers/thermal/x86_pkg_temp_thermal.c @@ -599,7 +599,6 @@ static int __init pkg_temp_thermal_init(void) return 0; err_ret: - get_online_cpus(); for_each_online_cpu(i) put_core_offline(i); put_online_cpus(); -- cgit v1.2.3