[pve-devel] [PATCH pve-kernel] revert 2 changes in thermal driver causing an early kernel Oops.

Stoiko Ivanov s.ivanov at proxmox.com
Fri Apr 5 11:27:02 CEST 2024


The second patch, that is reverted (first):
`thermal: trip: Drop lockdep assertion from thermal_zone_trip_id()`
only touches code introduced by the first patch.
The first patch causes the following Oops (reproduced on an old
HP DL380 G8):
```
[    2.960519] ACPI: button: Power Button [PWRF]
[    2.963126] BUG: kernel NULL pointer dereference, address: 000000000000000c
[    2.965667] #PF: supervisor read access in kernel mode
[    2.966954] #PF: error_code(0x0000) - not-present page
[    2.966954] PGD 0 P4D 0
[    2.966954] Oops: 0000 [#1] PREEMPT SMP PTI
[    2.966954] CPU: 0 PID: 1 Comm: swapper/0 Tainted: G          I        6.5.13-4-pve #1
[    2.966954] Hardware name: HP ProLiant DL380p Gen8, BIOS P70 05/24/2019
[    2.966954] RIP: 0010:step_wise_throttle+0x48/0x360
[    2.966954] Code: 04 25 28 00 00 00 48 89 45 d0 31 c0 48 63 c6 48 8d 14 40 48 8b 87 50 03 00 00 4c 8d 24 90 e8 cf d0 ff ff c6 45 bf 00 89 45 b4 <41> 8b 04 24 41 39 85 78 03 00 00 0f 8d a9 02 00 00 0f 1f 44 00 00
[    2.966954] RSP: 0000:ffff9e2b8014bae8 EFLAGS: 00010246
[    2.966954] RAX: 0000000000000002 RBX: 0000000000000001 RCX: 0000000000000000
[    2.966954] RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000000
[    2.966954] RBP: ffff9e2b8014bb40 R08: 0000000000000000 R09: 0000000000000000
[    2.966954] R10: 0000000000000000 R11: 0000000000000000 R12: 000000000000000c
[    2.966954] R13: ffff8c7ac421d000 R14: 0000000000000001 R15: 0000000000000000
[    2.966954] FS:  0000000000000000(0000) GS:ffff8c7def600000(0000) knlGS:0000000000000000
[    2.966954] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[    2.966954] CR2: 000000000000000c CR3: 0000000513a34001 CR4: 00000000000606f0
[    2.966954] Call Trace:
[    2.966954]  <TASK>
```

the relevant mainline kernels (6.6.15), corresponding to the
Ubuntu-patchset (which mixes changes from 6.6.15, with ones from
6.1.76) [0] - also boot happily - so I strongly assume that the
changes depend on one of the many commits introduced in linux-upstream
between v6.5.1 and v6.6.1.
As it looks like a refactoring (upon which later changes are based),
and not a bug-fix in itself - simply dropping it seems sensible.

Signed-off-by: Stoiko Ivanov <s.ivanov at proxmox.com>
---
 ...rip-Drop-lockdep-assertion-from-ther.patch |  24 ++
 ...ore-Store-trip-pointer-in-struct-the.patch | 343 ++++++++++++++++++
 2 files changed, 367 insertions(+)
 create mode 100644 patches/kernel/0014-Revert-thermal-trip-Drop-lockdep-assertion-from-ther.patch
 create mode 100644 patches/kernel/0015-Revert-thermal-core-Store-trip-pointer-in-struct-the.patch

diff --git a/patches/kernel/0014-Revert-thermal-trip-Drop-lockdep-assertion-from-ther.patch b/patches/kernel/0014-Revert-thermal-trip-Drop-lockdep-assertion-from-ther.patch
new file mode 100644
index 000000000000..413b1641a4b1
--- /dev/null
+++ b/patches/kernel/0014-Revert-thermal-trip-Drop-lockdep-assertion-from-ther.patch
@@ -0,0 +1,24 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stoiko Ivanov <s.ivanov at proxmox.com>
+Date: Thu, 4 Apr 2024 11:41:15 +0200
+Subject: [PATCH] Revert "thermal: trip: Drop lockdep assertion from
+ thermal_zone_trip_id()"
+
+This reverts commit c723c4fca6d2db3815623ff4dc0ea51667b56b89.
+---
+ drivers/thermal/thermal_trip.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
+index 68bea8706c597..1d4fe63e09f77 100644
+--- a/drivers/thermal/thermal_trip.c
++++ b/drivers/thermal/thermal_trip.c
+@@ -201,6 +201,8 @@ int thermal_zone_trip_id(struct thermal_zone_device *tz,
+ {
+ 	int i;
+ 
++	lockdep_assert_held(&tz->lock);
++
+ 	for (i = 0; i < tz->num_trips; i++) {
+ 		if (&tz->trips[i] == trip)
+ 			return i;
diff --git a/patches/kernel/0015-Revert-thermal-core-Store-trip-pointer-in-struct-the.patch b/patches/kernel/0015-Revert-thermal-core-Store-trip-pointer-in-struct-the.patch
new file mode 100644
index 000000000000..fe1ce3ed6632
--- /dev/null
+++ b/patches/kernel/0015-Revert-thermal-core-Store-trip-pointer-in-struct-the.patch
@@ -0,0 +1,343 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Stoiko Ivanov <s.ivanov at proxmox.com>
+Date: Thu, 4 Apr 2024 11:41:17 +0200
+Subject: [PATCH] Revert "thermal: core: Store trip pointer in struct
+ thermal_instance"
+
+This reverts commit 643b451957369f28b7770af387d14d4e4712074b.
+---
+ drivers/thermal/gov_bang_bang.c       | 23 +++++++++++++++--------
+ drivers/thermal/gov_fair_share.c      |  5 ++---
+ drivers/thermal/gov_power_allocator.c | 11 +++--------
+ drivers/thermal/gov_step_wise.c       | 16 +++++++++-------
+ drivers/thermal/thermal_core.c        | 15 +++++----------
+ drivers/thermal/thermal_core.h        |  4 +---
+ drivers/thermal/thermal_helpers.c     |  5 +----
+ drivers/thermal/thermal_sysfs.c       |  3 +--
+ drivers/thermal/thermal_trip.c        | 15 ---------------
+ 9 files changed, 37 insertions(+), 60 deletions(-)
+
+diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
+index 49cdfaa3a9279..1b121066521ff 100644
+--- a/drivers/thermal/gov_bang_bang.c
++++ b/drivers/thermal/gov_bang_bang.c
+@@ -13,21 +13,28 @@
+ 
+ #include "thermal_core.h"
+ 
+-static int thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_index)
++static int thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id)
+ {
+-	const struct thermal_trip *trip = &tz->trips[trip_index];
++	struct thermal_trip trip;
+ 	struct thermal_instance *instance;
++	int ret;
++
++	ret = __thermal_zone_get_trip(tz, trip_id, &trip);
++	if (ret) {
++		pr_warn_once("Failed to retrieve trip point %d\n", trip_id);
++		return ret;
++	}
+ 
+-	if (!trip->hysteresis)
++	if (!trip.hysteresis)
+ 		dev_info_once(&tz->device,
+ 			      "Zero hysteresis value for thermal zone %s\n", tz->type);
+ 
+ 	dev_dbg(&tz->device, "Trip%d[temp=%d]:temp=%d:hyst=%d\n",
+-				trip_index, trip->temperature, tz->temperature,
+-				trip->hysteresis);
++				trip_id, trip.temperature, tz->temperature,
++				trip.hysteresis);
+ 
+ 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+-		if (instance->trip != trip)
++		if (instance->trip != trip_id)
+ 			continue;
+ 
+ 		/* in case fan is in initial state, switch the fan off */
+@@ -45,10 +52,10 @@ static int thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_ind
+ 		 * enable fan when temperature exceeds trip_temp and disable
+ 		 * the fan in case it falls below trip_temp minus hysteresis
+ 		 */
+-		if (instance->target == 0 && tz->temperature >= trip->temperature)
++		if (instance->target == 0 && tz->temperature >= trip.temperature)
+ 			instance->target = 1;
+ 		else if (instance->target == 1 &&
+-			 tz->temperature <= trip->temperature - trip->hysteresis)
++			 tz->temperature <= trip.temperature - trip.hysteresis)
+ 			instance->target = 0;
+ 
+ 		dev_dbg(&instance->cdev->device, "target=%d\n",
+diff --git a/drivers/thermal/gov_fair_share.c b/drivers/thermal/gov_fair_share.c
+index 2abeb8979f500..03c2daeb6ee8b 100644
+--- a/drivers/thermal/gov_fair_share.c
++++ b/drivers/thermal/gov_fair_share.c
+@@ -49,7 +49,7 @@ static long get_target_state(struct thermal_zone_device *tz,
+ /**
+  * fair_share_throttle - throttles devices associated with the given zone
+  * @tz: thermal_zone_device
+- * @trip_index: trip point index
++ * @trip: trip point index
+  *
+  * Throttling Logic: This uses three parameters to calculate the new
+  * throttle state of the cooling devices associated with the given zone.
+@@ -65,9 +65,8 @@ static long get_target_state(struct thermal_zone_device *tz,
+  *	(Heavily assumes the trip points are in ascending order)
+  * new_state of cooling device = P3 * P2 * P1
+  */
+-static int fair_share_throttle(struct thermal_zone_device *tz, int trip_index)
++static int fair_share_throttle(struct thermal_zone_device *tz, int trip)
+ {
+-	const struct thermal_trip *trip = &tz->trips[trip_index];
+ 	struct thermal_instance *instance;
+ 	int total_weight = 0;
+ 	int total_instance = 0;
+diff --git a/drivers/thermal/gov_power_allocator.c b/drivers/thermal/gov_power_allocator.c
+index fc969642f70b7..fb311339bd08f 100644
+--- a/drivers/thermal/gov_power_allocator.c
++++ b/drivers/thermal/gov_power_allocator.c
+@@ -90,14 +90,12 @@ static u32 estimate_sustainable_power(struct thermal_zone_device *tz)
+ 	u32 sustainable_power = 0;
+ 	struct thermal_instance *instance;
+ 	struct power_allocator_params *params = tz->governor_data;
+-	const struct thermal_trip *trip_max_desired_temperature =
+-			&tz->trips[params->trip_max_desired_temperature];
+ 
+ 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ 		struct thermal_cooling_device *cdev = instance->cdev;
+ 		u32 min_power;
+ 
+-		if (instance->trip != trip_max_desired_temperature)
++		if (instance->trip != params->trip_max_desired_temperature)
+ 			continue;
+ 
+ 		if (!cdev_is_power_actor(cdev))
+@@ -385,13 +383,12 @@ static int allocate_power(struct thermal_zone_device *tz,
+ {
+ 	struct thermal_instance *instance;
+ 	struct power_allocator_params *params = tz->governor_data;
+-	const struct thermal_trip *trip_max_desired_temperature =
+-			&tz->trips[params->trip_max_desired_temperature];
+ 	u32 *req_power, *max_power, *granted_power, *extra_actor_power;
+ 	u32 *weighted_req_power;
+ 	u32 total_req_power, max_allocatable_power, total_weighted_req_power;
+ 	u32 total_granted_power, power_range;
+ 	int i, num_actors, total_weight, ret = 0;
++	int trip_max_desired_temperature = params->trip_max_desired_temperature;
+ 
+ 	num_actors = 0;
+ 	total_weight = 0;
+@@ -567,14 +564,12 @@ static void allow_maximum_power(struct thermal_zone_device *tz, bool update)
+ {
+ 	struct thermal_instance *instance;
+ 	struct power_allocator_params *params = tz->governor_data;
+-	const struct thermal_trip *trip_max_desired_temperature =
+-			&tz->trips[params->trip_max_desired_temperature];
+ 	u32 req_power;
+ 
+ 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+ 		struct thermal_cooling_device *cdev = instance->cdev;
+ 
+-		if ((instance->trip != trip_max_desired_temperature) ||
++		if ((instance->trip != params->trip_max_desired_temperature) ||
+ 		    (!cdev_is_power_actor(instance->cdev)))
+ 			continue;
+ 
+diff --git a/drivers/thermal/gov_step_wise.c b/drivers/thermal/gov_step_wise.c
+index 849dc1ec8d27c..1050fb4d94c2d 100644
+--- a/drivers/thermal/gov_step_wise.c
++++ b/drivers/thermal/gov_step_wise.c
+@@ -81,24 +81,26 @@ static void update_passive_instance(struct thermal_zone_device *tz,
+ 
+ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id)
+ {
+-	const struct thermal_trip *trip = &tz->trips[trip_id];
+ 	enum thermal_trend trend;
+ 	struct thermal_instance *instance;
++	struct thermal_trip trip;
+ 	bool throttle = false;
+ 	int old_target;
+ 
++	__thermal_zone_get_trip(tz, trip_id, &trip);
++
+ 	trend = get_tz_trend(tz, trip_id);
+ 
+-	if (tz->temperature >= trip->temperature) {
++	if (tz->temperature >= trip.temperature) {
+ 		throttle = true;
+-		trace_thermal_zone_trip(tz, trip_id, trip->type);
++		trace_thermal_zone_trip(tz, trip_id, trip.type);
+ 	}
+ 
+ 	dev_dbg(&tz->device, "Trip%d[type=%d,temp=%d]:trend=%d,throttle=%d\n",
+-		trip_id, trip->type, trip->temperature, trend, throttle);
++				trip_id, trip.type, trip.temperature, trend, throttle);
+ 
+ 	list_for_each_entry(instance, &tz->thermal_instances, tz_node) {
+-		if (instance->trip != trip)
++		if (instance->trip != trip_id)
+ 			continue;
+ 
+ 		old_target = instance->target;
+@@ -112,11 +114,11 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip_id
+ 		/* Activate a passive thermal instance */
+ 		if (old_target == THERMAL_NO_TARGET &&
+ 			instance->target != THERMAL_NO_TARGET)
+-			update_passive_instance(tz, trip->type, 1);
++			update_passive_instance(tz, trip.type, 1);
+ 		/* Deactivate a passive thermal instance */
+ 		else if (old_target != THERMAL_NO_TARGET &&
+ 			instance->target == THERMAL_NO_TARGET)
+-			update_passive_instance(tz, trip->type, -1);
++			update_passive_instance(tz, trip.type, -1);
+ 
+ 		instance->initialized = true;
+ 		mutex_lock(&instance->cdev->lock);
+diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
+index c066c09555667..69cff5fc32156 100644
+--- a/drivers/thermal/thermal_core.c
++++ b/drivers/thermal/thermal_core.c
+@@ -582,7 +582,7 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
+ /**
+  * thermal_zone_bind_cooling_device() - bind a cooling device to a thermal zone
+  * @tz:		pointer to struct thermal_zone_device
+- * @trip_index:	indicates which trip point the cooling devices is
++ * @trip:	indicates which trip point the cooling devices is
+  *		associated with in this thermal zone.
+  * @cdev:	pointer to struct thermal_cooling_device
+  * @upper:	the Maximum cooling state for this trip point.
+@@ -602,7 +602,7 @@ struct thermal_zone_device *thermal_zone_get_by_id(int id)
+  * Return: 0 on success, the proper error value otherwise.
+  */
+ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+-				     int trip_index,
++				     int trip,
+ 				     struct thermal_cooling_device *cdev,
+ 				     unsigned long upper, unsigned long lower,
+ 				     unsigned int weight)
+@@ -611,15 +611,12 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz,
+ 	struct thermal_instance *pos;
+ 	struct thermal_zone_device *pos1;
+ 	struct thermal_cooling_device *pos2;
+-	const struct thermal_trip *trip;
+ 	bool upper_no_limit;
+ 	int result;
+ 
+-	if (trip_index >= tz->num_trips || trip_index < 0)
++	if (trip >= tz->num_trips || trip < 0)
+ 		return -EINVAL;
+ 
+-	trip = &tz->trips[trip_index];
+-
+ 	list_for_each_entry(pos1, &thermal_tz_list, node) {
+ 		if (pos1 == tz)
+ 			break;
+@@ -724,7 +721,7 @@ EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
+  * thermal_zone_unbind_cooling_device() - unbind a cooling device from a
+  *					  thermal zone.
+  * @tz:		pointer to a struct thermal_zone_device.
+- * @trip_index:	indicates which trip point the cooling devices is
++ * @trip:	indicates which trip point the cooling devices is
+  *		associated with in this thermal zone.
+  * @cdev:	pointer to a struct thermal_cooling_device.
+  *
+@@ -735,15 +732,13 @@ EXPORT_SYMBOL_GPL(thermal_zone_bind_cooling_device);
+  * Return: 0 on success, the proper error value otherwise.
+  */
+ int thermal_zone_unbind_cooling_device(struct thermal_zone_device *tz,
+-				       int trip_index,
++				       int trip,
+ 				       struct thermal_cooling_device *cdev)
+ {
+ 	struct thermal_instance *pos, *next;
+-	const struct thermal_trip *trip;
+ 
+ 	mutex_lock(&tz->lock);
+ 	mutex_lock(&cdev->lock);
+-	trip = &tz->trips[trip_index];
+ 	list_for_each_entry_safe(pos, next, &tz->thermal_instances, tz_node) {
+ 		if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
+ 			list_del(&pos->tz_node);
+diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h
+index a33b389bbcfe8..17c1bbed734d3 100644
+--- a/drivers/thermal/thermal_core.h
++++ b/drivers/thermal/thermal_core.h
+@@ -91,7 +91,7 @@ struct thermal_instance {
+ 	char name[THERMAL_NAME_LENGTH];
+ 	struct thermal_zone_device *tz;
+ 	struct thermal_cooling_device *cdev;
+-	const struct thermal_trip *trip;
++	int trip;
+ 	bool initialized;
+ 	unsigned long upper;	/* Highest cooling state for this trip point */
+ 	unsigned long lower;	/* Lowest cooling state for this trip point */
+@@ -123,8 +123,6 @@ void __thermal_zone_device_update(struct thermal_zone_device *tz,
+ void __thermal_zone_set_trips(struct thermal_zone_device *tz);
+ int __thermal_zone_get_trip(struct thermal_zone_device *tz, int trip_id,
+ 			    struct thermal_trip *trip);
+-int thermal_zone_trip_id(struct thermal_zone_device *tz,
+-			 const struct thermal_trip *trip);
+ int __thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
+ 
+ /* sysfs I/F */
+diff --git a/drivers/thermal/thermal_helpers.c b/drivers/thermal/thermal_helpers.c
+index 421ed301541e1..cfba0965a22da 100644
+--- a/drivers/thermal/thermal_helpers.c
++++ b/drivers/thermal/thermal_helpers.c
+@@ -41,17 +41,14 @@ int get_tz_trend(struct thermal_zone_device *tz, int trip)
+ 
+ struct thermal_instance *
+ get_thermal_instance(struct thermal_zone_device *tz,
+-		     struct thermal_cooling_device *cdev, int trip_index)
++		     struct thermal_cooling_device *cdev, int trip)
+ {
+ 	struct thermal_instance *pos = NULL;
+ 	struct thermal_instance *target_instance = NULL;
+-	const struct thermal_trip *trip;
+ 
+ 	mutex_lock(&tz->lock);
+ 	mutex_lock(&cdev->lock);
+ 
+-	trip = &tz->trips[trip_index];
+-
+ 	list_for_each_entry(pos, &tz->thermal_instances, tz_node) {
+ 		if (pos->tz == tz && pos->trip == trip && pos->cdev == cdev) {
+ 			target_instance = pos;
+diff --git a/drivers/thermal/thermal_sysfs.c b/drivers/thermal/thermal_sysfs.c
+index eef40d4f30639..4e6a97db894e9 100644
+--- a/drivers/thermal/thermal_sysfs.c
++++ b/drivers/thermal/thermal_sysfs.c
+@@ -943,8 +943,7 @@ trip_point_show(struct device *dev, struct device_attribute *attr, char *buf)
+ 	instance =
+ 	    container_of(attr, struct thermal_instance, attr);
+ 
+-	return sprintf(buf, "%d\n",
+-		       thermal_zone_trip_id(instance->tz, instance->trip));
++	return sprintf(buf, "%d\n", instance->trip);
+ }
+ 
+ ssize_t
+diff --git a/drivers/thermal/thermal_trip.c b/drivers/thermal/thermal_trip.c
+index 1d4fe63e09f77..21736e02fa360 100644
+--- a/drivers/thermal/thermal_trip.c
++++ b/drivers/thermal/thermal_trip.c
+@@ -195,18 +195,3 @@ int thermal_zone_set_trip(struct thermal_zone_device *tz, int trip_id,
+ 
+ 	return 0;
+ }
+-
+-int thermal_zone_trip_id(struct thermal_zone_device *tz,
+-			 const struct thermal_trip *trip)
+-{
+-	int i;
+-
+-	lockdep_assert_held(&tz->lock);
+-
+-	for (i = 0; i < tz->num_trips; i++) {
+-		if (&tz->trips[i] == trip)
+-			return i;
+-	}
+-
+-	return -ENODATA;
+-}
-- 
2.39.2





More information about the pve-devel mailing list