[pve-devel] [PATCH kernel 2/2] cherry-pick potential fix for NULL pointer deref with AMD Arcturus GPU during boot

Fiona Ebner f.ebner at proxmox.com
Wed Jul 10 13:37:09 CEST 2024


The issue was reported via enterprise support and is being handled by
Alexander Zeidler. During boot, amdgpu hits a NULL pointer dereference
in arcturus_get_power_limit() (see the trace [0]), which later causes
networking problems, because 'udevadm settle' times out. The customer
reported that mainline kernel 6.9.3 boots fine. Looking at the newer
commits, this one stood out, as it heavily modifies the
arcturus_get_power_limit() function by dropping its use of the OD
table. While not tagged for stable, it seems straightforward enough
and has a good chance of fixing the issue.

[0]:

> Jul 09 07:34:59 proxmox kernel: BUG: kernel NULL pointer dereference, address: 000000000000000f
> Jul 09 07:34:59 proxmox kernel: #PF: supervisor read access in kernel mode
> Jul 09 07:34:59 proxmox kernel: #PF: error_code(0x0000) - not-present page
> Jul 09 07:34:59 proxmox kernel: PGD 0 P4D 0
> Jul 09 07:34:59 proxmox kernel: Oops: 0000 [#1] PREEMPT SMP NOPTI
> Jul 09 07:34:59 proxmox kernel: CPU: 0 PID: 9 Comm: kworker/0:1 Tainted: P           O       6.8.8-2-pve #1
> Jul 09 07:34:59 proxmox kernel: Hardware name: Supermicro AS -4124GS-TNR-03-EB004/H12DSG-O-CPU, BIOS 2.7 09/21/2023
> Jul 09 07:34:59 proxmox kernel: Workqueue: events work_for_cpu_fn
> Jul 09 07:34:59 proxmox kernel: RIP: 0010:arcturus_get_power_limit+0xb5/0x1b0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel: Code: 24 48 85 d2 74 05 8b 45 cc 89 02 4d 85 ff 74 38 44 0f b6 a3 b8 06 00 00 41 80 fc 01 0f 87 81 d7 3d 00 48 8b 45 b0 41 83 e4 01 <0f> b6 40 0f 75 10 84 c0 74 14 45 8b bf 86 01 00 00 45 31 e4 eb 0e
> Jul 09 07:34:59 proxmox kernel: RSP: 0018:ffffaa42c029fc38 EFLAGS: 00010246
> Jul 09 07:34:59 proxmox kernel: RAX: 0000000000000000 RBX: ffff8d803362b000 RCX: 0000000000000000
> Jul 09 07:34:59 proxmox kernel: RDX: ffff8d803362b6c0 RSI: 0000000000000000 RDI: 0000000000000000
> Jul 09 07:34:59 proxmox kernel: RBP: ffffaa42c029fc88 R08: 0000000000000000 R09: ffffffffc177e1f0
> Jul 09 07:34:59 proxmox kernel: R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
> Jul 09 07:34:59 proxmox kernel: R13: ffff8d803362b6c8 R14: ffff8d803362b6c4 R15: ffff8d80424a1014
> Jul 09 07:34:59 proxmox kernel: FS:  0000000000000000(0000) GS:ffff8e7f0ae00000(0000) knlGS:0000000000000000
> Jul 09 07:34:59 proxmox kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
> Jul 09 07:34:59 proxmox kernel: CR2: 000000000000000f CR3: 0000006b24a36003 CR4: 0000000000f70ef0
> Jul 09 07:34:59 proxmox kernel: PKRU: 55555554
> Jul 09 07:34:59 proxmox kernel: Call Trace:
> Jul 09 07:34:59 proxmox kernel:  <TASK>
> Jul 09 07:34:59 proxmox kernel:  ? show_regs+0x6d/0x80
> Jul 09 07:34:59 proxmox kernel:  ? __die+0x24/0x80
> Jul 09 07:34:59 proxmox kernel:  ? page_fault_oops+0x176/0x500
> Jul 09 07:34:59 proxmox kernel:  ? do_user_addr_fault+0x2f9/0x6b0
> Jul 09 07:34:59 proxmox kernel:  ? exc_page_fault+0x83/0x1b0
> Jul 09 07:34:59 proxmox kernel:  ? asm_exc_page_fault+0x27/0x30
> Jul 09 07:34:59 proxmox kernel:  ? __pfx_arcturus_get_power_limit+0x10/0x10 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  ? arcturus_get_power_limit+0xb5/0x1b0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  ? arcturus_get_power_limit+0x62/0x1b0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  smu_late_init+0x16f/0x4d0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  amdgpu_device_ip_late_init+0x68/0x2a0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  amdgpu_device_init+0x242d/0x26e0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  ? srso_alias_return_thunk+0x5/0xfbef5
> Jul 09 07:34:59 proxmox kernel:  amdgpu_driver_load_kms+0x1a/0x1c0 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  amdgpu_pci_probe+0x195/0x520 [amdgpu]
> Jul 09 07:34:59 proxmox kernel:  local_pci_probe+0x47/0xb0
> Jul 09 07:34:59 proxmox kernel:  work_for_cpu_fn+0x1a/0x30
> Jul 09 07:34:59 proxmox kernel:  process_one_work+0x16d/0x350
> Jul 09 07:34:59 proxmox kernel:  worker_thread+0x306/0x440
> Jul 09 07:34:59 proxmox kernel:  ? __pfx_worker_thread+0x10/0x10
> Jul 09 07:34:59 proxmox kernel:  kthread+0xf2/0x120
> Jul 09 07:34:59 proxmox kernel:  ? __pfx_kthread+0x10/0x10
> Jul 09 07:34:59 proxmox kernel:  ret_from_fork+0x47/0x70
> Jul 09 07:34:59 proxmox kernel:  ? __pfx_kthread+0x10/0x10
> Jul 09 07:34:59 proxmox kernel:  ret_from_fork_asm+0x1b/0x30
> Jul 09 07:34:59 proxmox kernel:  </TASK>

Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
---
 ...pu-pm-Don-t-use-OD-table-on-Arcturus.patch | 69 +++++++++++++++++++
 1 file changed, 69 insertions(+)
 create mode 100644 patches/kernel/0019-drm-amdgpu-pm-Don-t-use-OD-table-on-Arcturus.patch

diff --git a/patches/kernel/0019-drm-amdgpu-pm-Don-t-use-OD-table-on-Arcturus.patch b/patches/kernel/0019-drm-amdgpu-pm-Don-t-use-OD-table-on-Arcturus.patch
new file mode 100644
index 0000000..cd88e43
--- /dev/null
+++ b/patches/kernel/0019-drm-amdgpu-pm-Don-t-use-OD-table-on-Arcturus.patch
@@ -0,0 +1,69 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Ma Jun <Jun.Ma2 at amd.com>
+Date: Tue, 19 Mar 2024 11:02:29 +0800
+Subject: [PATCH] drm/amdgpu/pm: Don't use OD table on Arcturus
+
+OD is not supported on Arcturus, so the OD table
+should not be used.
+
+Signed-off-by: Ma Jun <Jun.Ma2 at amd.com>
+Acked-by: Alex Deucher <alexander.deucher at amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
+(cherry picked from commit bc55c344b06f7e6f99eb92d393ff0a84c1532514)
+Signed-off-by: Fiona Ebner <f.ebner at proxmox.com>
+---
+ .../gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c | 33 +++----------------
+ 1 file changed, 5 insertions(+), 28 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+index 40ba7227cca5..0c2d04f978ac 100644
+--- a/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
++++ b/drivers/gpu/drm/amd/pm/swsmu/smu11/arcturus_ppt.c
+@@ -1283,11 +1283,8 @@ static int arcturus_get_power_limit(struct smu_context *smu,
+ 					uint32_t *max_power_limit,
+ 					uint32_t *min_power_limit)
+ {
+-	struct smu_11_0_powerplay_table *powerplay_table =
+-		(struct smu_11_0_powerplay_table *)smu->smu_table.power_play_table;
+-	struct smu_11_0_overdrive_table *od_settings = smu->od_settings;
+ 	PPTable_t *pptable = smu->smu_table.driver_pptable;
+-	uint32_t power_limit, od_percent_upper = 0, od_percent_lower = 0;
++	uint32_t power_limit;
+ 
+ 	if (smu_v11_0_get_current_power_limit(smu, &power_limit)) {
+ 		/* the last hope to figure out the ppt limit */
+@@ -1303,30 +1300,10 @@ static int arcturus_get_power_limit(struct smu_context *smu,
+ 		*current_power_limit = power_limit;
+ 	if (default_power_limit)
+ 		*default_power_limit = power_limit;
+-
+-	if (powerplay_table) {
+-		if (smu->od_enabled &&
+-				od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) {
+-			od_percent_upper = le32_to_cpu(powerplay_table->overdrive_table.max[SMU_11_0_ODSETTING_POWERPERCENTAGE]);
+-			od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]);
+-		} else if (od_settings->cap[SMU_11_0_ODCAP_POWER_LIMIT]) {
+-			od_percent_upper = 0;
+-			od_percent_lower = le32_to_cpu(powerplay_table->overdrive_table.min[SMU_11_0_ODSETTING_POWERPERCENTAGE]);
+-		}
+-	}
+-
+-	dev_dbg(smu->adev->dev, "od percent upper:%d, od percent lower:%d (default power: %d)\n",
+-							od_percent_upper, od_percent_lower, power_limit);
+-
+-	if (max_power_limit) {
+-		*max_power_limit = power_limit * (100 + od_percent_upper);
+-		*max_power_limit /= 100;
+-	}
+-
+-	if (min_power_limit) {
+-		*min_power_limit = power_limit * (100 - od_percent_lower);
+-		*min_power_limit /= 100;
+-	}
++	if (max_power_limit)
++		*max_power_limit = power_limit;
++	if (min_power_limit)
++		*min_power_limit = power_limit;
+ 
+ 	return 0;
+ }
-- 
2.39.2





More information about the pve-devel mailing list