在 PetaLinux 内核模块中对设备树指定的内存区域调用 memset_io 时出现 SError 内核恐慌(kernel panic)
SError kernel panic when memset_io'ing on device-tree memory area in Petalinux kernel module
我的平台: ZynQ MP; PetaLinux 2020.2
构建系统: Ubuntu 18.04
我正在编写一个内核模块,它在启动时向 PetaLinux 内核将自己注册为 platform_device,其 compatible 字符串为 'erika'。
我在 system-user.dtsi 文件中为此设备指定了两个内存区域:一个较大的区域 (16M),用于保存在 APU 上的 PetaLinux 与 RPU 上的裸机程序之间交换的数据;以及一个较小的区域 (4k),用于保存 APU 上的内核模块与 RPU 上的裸机应用程序之间共享的中断信号:
/*
 * User device-tree additions layered on top of system-conf.dtsi:
 *  - a no-map reserved-memory region used by the R5 remoteproc node,
 *  - the "erika,erika" platform device with two reg ranges and one interrupt,
 *  - the zynqmp-rpu remoteproc node (split mode) with its two TCM banks,
 *  - an override of the ttc0 node.
 */
/include/ "system-conf.dtsi"
/{
reserved-memory
{
#address-cells = <2>;
#size-cells = <2>;
ranges;
/* 0x2000000 bytes (32 MiB) starting at 0x3ed00000; referenced by r5_0 via memory-region. */
rproc_0_reserved: rproc@0x3ed00000
{
no-map;
reg = <0x0 0x3ed00000 0x0 0x2000000>;
};
};
amba
{
/* Shared memory, IPI memory & interrupt for ERIKA module */
shm0: shm@0
{
compatible = "erika,erika";
/* Our erika kernel module will use the first memory range for SHM */
/* and the second memory range for IPI flags */
/* Range 0: 0x3ed80000, size 0x1000000 (16 MiB); it lies inside rproc_0_reserved. */
/* Range 1: 0xff340000, size 0x1000 (4 KiB). */
/* NOTE(review): 0xff340000 is hardware register space (IPI channel 7, see UG1085 Table 13-3), not RAM. */
reg = <0x0 0x3ed80000 0x0 0x1000000
0x0 0xff340000 0x0 0x1000>;
interrupt-parent = <&gic>;
/* 'interrupts' vector - meaning: */
/* 1st index: 0 --> SPI (shared peripheral interrupt) */
/* 2nd index: Interrupt 29 => +32 (SPI) => 61 (HW IRQ#) = IPI_Ch7 */
/* 3rd index: 4 --> level-triggered interrupt (active high) */
/* acc. to Xilinx UG1085 (v2020.2), Table 13-1 */
interrupts = <0 29 4>;
};
};
/* Remoteproc description of the RPU; core_conf selects "split" operation. */
zynqmp-rpu
{
compatible = "xlnx,zynqmp-r5-remoteproc-1.0";
#address-cells = <2>;
#size-cells = <2>;
ranges;
core_conf = "split";
r5_0: r5@0
{
#address-cells = <2>;
#size-cells = <2>;
ranges;
/* Uses the reserved region declared under reserved-memory above. */
memory-region = <&rproc_0_reserved>;
/* NOTE(review): pnode-id values are presumably platform power-node IDs — confirm against the Xilinx binding. */
pnode-id = <0x7>;
/* TCM bank at 0xFFE00000, size 0x10000 (64 KiB). */
tcm_0_a: tcm_0@0
{
reg = <0x0 0xFFE00000 0x0 0x10000>;
pnode-id = <0xf>;
};
/* TCM bank at 0xFFE20000, size 0x10000 (64 KiB). */
tcm_0_b: tcm_0@1
{
reg = <0x0 0xFFE20000 0x0 0x10000>;
pnode-id = <0x10>;
};
};
};
};&ttc0
{
/* Overrides the existing ttc0 node: replaces its compatible string and enables it. */
/* NOTE(review): presumably done so that no stock Linux driver binds to this timer — confirm. */
compatible = "ttc0";
status = "okay";
};
在模块的 probe() 函数中,我按如下方式映射这两个内存区域:
/*
 * Excerpt from the module's probe() routine: fetch the device's two MEM
 * resources (index 0 = shared memory, index 1 = IPI range) and map each of
 * them with devm_memremap() using MEMREMAP_WT.
 */
/* Get shared memory for the device */
r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
if (!r_mem_shm) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Get IPI register memory for the device */
r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
if (!r_mem_ipi) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Map the whole shared-memory resource; the length is end - start + 1. */
nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WT);
/* A failed mapping is detected with IS_ERR() and returned as PTR_ERR(). */
if (IS_ERR(nic->base_addr_shm)) {
dev_err(dev, "Could not map shared memory at 0x%08llx",
(uint64_t __force)r_mem_shm->start);
return PTR_ERR(nic->base_addr_shm);
}
/*
 * NOTE(review): the IPI range is mapped exactly like the shared memory
 * (devm_memremap + MEMREMAP_WT), i.e. it is treated as ordinary memory —
 * confirm this is appropriate for a register block.
 */
nic->base_addr_ipi = devm_memremap(&pdev->dev, r_mem_ipi->start, r_mem_ipi->end - r_mem_ipi->start + 1, MEMREMAP_WT);
if (IS_ERR(nic->base_addr_ipi)) {
dev_err(dev, "Could not map IPI memory at 0x%08llx",
(uint64_t __force)r_mem_ipi->start);
return PTR_ERR(nic->base_addr_ipi);
}
映射成功。稍后在该函数中(成功获取 IRQ 之后),我将这两个内存区域清零,以获得干净的初始状态:
/*
 * Excerpt from the module's probe() routine: store the driver state on the
 * platform device, record start/end/size of both ranges, then zero both
 * mappings with memset_io().
 */
platform_set_drvdata(nic->pdev, nic);
/* Sizes are computed as end - start + 1. */
nic->shm_start = r_mem_shm->start;
nic->shm_end = r_mem_shm->end;
nic->shm_size = nic->shm_end - nic->shm_start + 1;
nic->ipi_start = r_mem_ipi->start;
nic->ipi_end = r_mem_ipi->end;
nic->ipi_size = nic->ipi_end - nic->ipi_start + 1;
/* Clear shared memory & IPI memory */
/* NOTE(review): the sizes are printed with %d — confirm this matches the type of shm_size/ipi_size. */
dev_info(dev, "before memset_io(shm....), nic->shm_size = %d", nic->shm_size);
memset_io(nic->base_addr_shm, 0, nic->shm_size);
dev_info(dev, "before memset_io(ipi....), nic->ipi_size = %d", nic->ipi_size);
/* Writes zeros across the entire second range (the one at 0xff340000 in the device tree). */
/* NOTE(review): that range is IPI register space per UG1085 Table 13-3 — verify that blanket-clearing it is permitted. */
memset_io(nic->base_addr_ipi, 0, nic->ipi_size);
printk(KERN_NOTICE "after memset_io(nic->base_addr_ipi)");
根据我的调试打印判断,nic->base_addr_shm 处的内存区域清零成功,而第二个 memset_io 调用导致了内核崩溃:
[ 5.234907] erika: loading out-of-tree module taints kernel.
[ 5.241470] <1>Hello world from erika module.
[ 5.246068] erika 3ed80000.shm: Device Tree Probing
[ 5.261139] erika 3ed80000.shm: erika shared memory at 0x3ed80000 mapped to 0xffff800015000000 with size 0x01000000
[ 5.275169] erika 3ed80000.shm: erika IPI memory at 0xff340000 mapped to 0xffff80001006d000 with size 0x00001000
[ 5.275577] zynqmp_r5_remoteproc zynqmp-rpu: RPU core_conf: split
[ 5.285399] erika 3ed80000.shm: before memset_io(shm....), nic->shm_size = 16777216
[ 5.293642] r5@0: no mailboxes.
[ 5.302430] remoteproc remoteproc0: r5@0 is available
[ 5.319927] erika 3ed80000.shm: before memset_io(ipi....), nic->ipi_size = 4096
[ 5.327430] SError Interrupt on CPU2, code 0xbf000002 -- SError
[ 5.327434] CPU: 2 PID: 371 Comm: udevd Tainted: G O 5.4.0-xilinx-v2020.2 #1
[ 5.327435] Hardware name: xlnx,zynqmp (DT)
[ 5.327437] pstate: 80000005 (Nzcv daif -PAN -UAO)
[ 5.327438] pc : __memset_io+0x68/0x98
[ 5.327440] lr : erika_probe+0x258/0x3bc [erika]
[ 5.327441] sp : ffff800012d6b940
[ 5.327443] x29: ffff800012d6b950 x28: 0000000000000100
[ 5.327446] x27: ffff80001013f510 x26: 000000000000002d
[ 5.327450] x25: ffff000877a98380 x24: ffff00087ab0d800
[ 5.327453] x23: ffff800008c513f8 x22: ffff00087aaa7000
[ 5.327456] x21: 0000000000000000 x20: ffff00087ab0d810
[ 5.327459] x19: ffff00087aaa77c0 x18: 0000000000000010
[ 5.327462] x17: 000000000f1828b4 x16: 00000000a67c5c83
[ 5.327465] x15: ffff00087a04b2e8 x14: ffffffffffffffff
[ 5.327469] x13: ffff800092d6b5b7 x12: ffff800012d6b5bf
[ 5.327472] x11: ffff8000110f5000 x10: 0000000000000000
[ 5.327475] x9 : ffff800011193000 x8 : 0000000000000152
[ 5.327478] x7 : 0000000000000006 x6 : ffff8000111930f2
[ 5.327481] x5 : 0000000000000003 x4 : 0000000000000000
[ 5.327484] x3 : 0000000000000000 x2 : 0000000000001000
[ 5.327488] x1 : ffff80001006e000 x0 : ffff80001006d0c8
[ 5.327492] Kernel panic - not syncing: Asynchronous SError Interrupt
[ 5.327494] CPU: 2 PID: 371 Comm: udevd Tainted: G O 5.4.0-xilinx-v2020.2 #1
[ 5.327496] Hardware name: xlnx,zynqmp (DT)
[ 5.327497] Call trace:
[ 5.327498] dump_backtrace+0x0/0x140
[ 5.327500] show_stack+0x14/0x20
[ 5.327501] dump_stack+0xac/0xd0
[ 5.327502] panic+0x140/0x30c
[ 5.327504] __stack_chk_fail+0x0/0x18
[ 5.327505] arm64_serror_panic+0x74/0x80
[ 5.327506] do_serror+0x114/0x118
[ 5.327508] el1_error+0x84/0xf8
[ 5.327509] __memset_io+0x68/0x98
[ 5.327510] platform_drv_probe+0x50/0xa0
[ 5.327511] really_probe+0xd8/0x2f8
[ 5.327513] driver_probe_device+0x54/0xe8
[ 5.327514] device_driver_attach+0x6c/0x78
[ 5.327516] __driver_attach+0x54/0xd0
[ 5.327517] bus_for_each_dev+0x6c/0xc0
[ 5.327518] driver_attach+0x20/0x28
[ 5.327520] bus_add_driver+0x148/0x1e0
[ 5.327521] driver_register+0x60/0x110
[ 5.327522] __platform_driver_register+0x44/0x50
[ 5.327524] erika_init+0x28/0x1000 [erika]
[ 5.327525] do_one_initcall+0x50/0x190
[ 5.327527] do_init_module+0x50/0x1f0
[ 5.327528] load_module+0x1ca4/0x2218
[ 5.327530] __do_sys_finit_module+0xd0/0xe8
[ 5.327531] __arm64_sys_finit_module+0x1c/0x28
[ 5.327533] el0_svc_common.constprop.0+0x68/0x160
[ 5.327534] el0_svc_handler+0x6c/0x88
[ 5.327535] el0_svc+0x8/0xc
[ 5.327548] SMP: stopping secondary CPUs
[ 5.327549] Kernel Offset: disabled
[ 5.327551] CPU features: 0x0002,20002004
[ 5.327552] Memory Limit: none
我想我解决了这个问题(也感谢 0andriy 的评论):
我的错误在于,把 IPI 寄存器空间
0x0 0xff340000 0x0 0x1000
当作了普通内存来处理,而它实际上是通道 7 进行 IPI 通信所用的一组特定寄存器的所在位置(参见 https://www.xilinx.com/support/documentation/user_guides/ug1085-zynq-ultrascale-trm.pdf 中的表 13-3)。你不应该(而且显然也无法在不破坏 [Peta]Linux 的前提下)简单地把这块区域清零。
我现在把代码改成了下面这样,看起来可以工作了(至少系统现在可以正常启动;我还需要和同事确认现在是否真的能够进行 IPI 通信)。主要改动是:共享内存改用 MEMREMAP_WB 映射,并用 memset() 清零;IPI 寄存器区域改用 devm_ioremap_resource() 映射,并且不再对其清零:
/*
 * Revised excerpt from the module's probe() routine: the shared memory is
 * mapped with devm_memremap() using MEMREMAP_WB and cleared with memset();
 * the IPI range is mapped with devm_ioremap_resource() and is not cleared.
 */
/* Get shared memory for the device */
r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
if (!r_mem_shm) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Get IPI register memory for the device */
r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
if (!r_mem_ipi) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Map the whole shared-memory resource; the length is end - start + 1. */
nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WB);
if (IS_ERR(nic->base_addr_shm)) {
/* %pR prints the resource itself instead of a hand-formatted start address. */
dev_err(dev, "Could not map shared memory at %pR", r_mem_shm);
return PTR_ERR(nic->base_addr_shm);
}
/* The IPI range is mapped through the I/O remap helper, not as memory. */
nic->base_addr_ipi = devm_ioremap_resource(&pdev->dev, r_mem_ipi);
if (IS_ERR(nic->base_addr_ipi)) {
dev_err(dev, "Could not map IPI memory at %pR", r_mem_ipi);
return PTR_ERR(nic->base_addr_ipi);
}
/* Clear shared memory */
/* NOTE(review): nic->shm_size must already be set here; its assignment is not shown in this excerpt. */
memset(nic->base_addr_shm, 0, nic->shm_size);
我的平台: ZynQ MP; PetaLinux 2020.2
构建系统: Ubuntu 18.04
我正在编写一个内核模块,它在启动时向 PetaLinux 内核将自己注册为 platform_device,其 compatible 字符串为 'erika'。
我在 system-user.dtsi 文件中为此设备指定了两个内存区域:一个较大的区域 (16M),用于保存在 APU 上的 PetaLinux 与 RPU 上的裸机程序之间交换的数据;以及一个较小的区域 (4k),用于保存 APU 上的内核模块与 RPU 上的裸机应用程序之间共享的中断信号:
/*
 * User device-tree additions layered on top of system-conf.dtsi:
 *  - a no-map reserved-memory region used by the R5 remoteproc node,
 *  - the "erika,erika" platform device with two reg ranges and one interrupt,
 *  - the zynqmp-rpu remoteproc node (split mode) with its two TCM banks,
 *  - an override of the ttc0 node.
 */
/include/ "system-conf.dtsi"
/{
reserved-memory
{
#address-cells = <2>;
#size-cells = <2>;
ranges;
/* 0x2000000 bytes (32 MiB) starting at 0x3ed00000; referenced by r5_0 via memory-region. */
rproc_0_reserved: rproc@0x3ed00000
{
no-map;
reg = <0x0 0x3ed00000 0x0 0x2000000>;
};
};
amba
{
/* Shared memory, IPI memory & interrupt for ERIKA module */
shm0: shm@0
{
compatible = "erika,erika";
/* Our erika kernel module will use the first memory range for SHM */
/* and the second memory range for IPI flags */
/* Range 0: 0x3ed80000, size 0x1000000 (16 MiB); it lies inside rproc_0_reserved. */
/* Range 1: 0xff340000, size 0x1000 (4 KiB). */
/* NOTE(review): 0xff340000 is hardware register space (IPI channel 7, see UG1085 Table 13-3), not RAM. */
reg = <0x0 0x3ed80000 0x0 0x1000000
0x0 0xff340000 0x0 0x1000>;
interrupt-parent = <&gic>;
/* 'interrupts' vector - meaning: */
/* 1st index: 0 --> SPI (shared peripheral interrupt) */
/* 2nd index: Interrupt 29 => +32 (SPI) => 61 (HW IRQ#) = IPI_Ch7 */
/* 3rd index: 4 --> level-triggered interrupt (active high) */
/* acc. to Xilinx UG1085 (v2020.2), Table 13-1 */
interrupts = <0 29 4>;
};
};
/* Remoteproc description of the RPU; core_conf selects "split" operation. */
zynqmp-rpu
{
compatible = "xlnx,zynqmp-r5-remoteproc-1.0";
#address-cells = <2>;
#size-cells = <2>;
ranges;
core_conf = "split";
r5_0: r5@0
{
#address-cells = <2>;
#size-cells = <2>;
ranges;
/* Uses the reserved region declared under reserved-memory above. */
memory-region = <&rproc_0_reserved>;
/* NOTE(review): pnode-id values are presumably platform power-node IDs — confirm against the Xilinx binding. */
pnode-id = <0x7>;
/* TCM bank at 0xFFE00000, size 0x10000 (64 KiB). */
tcm_0_a: tcm_0@0
{
reg = <0x0 0xFFE00000 0x0 0x10000>;
pnode-id = <0xf>;
};
/* TCM bank at 0xFFE20000, size 0x10000 (64 KiB). */
tcm_0_b: tcm_0@1
{
reg = <0x0 0xFFE20000 0x0 0x10000>;
pnode-id = <0x10>;
};
};
};
};&ttc0
{
/* Overrides the existing ttc0 node: replaces its compatible string and enables it. */
/* NOTE(review): presumably done so that no stock Linux driver binds to this timer — confirm. */
compatible = "ttc0";
status = "okay";
};
在模块的 probe() 函数中,我按如下方式映射这两个内存区域:
/*
 * Excerpt from the module's probe() routine: fetch the device's two MEM
 * resources (index 0 = shared memory, index 1 = IPI range) and map each of
 * them with devm_memremap() using MEMREMAP_WT.
 */
/* Get shared memory for the device */
r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
if (!r_mem_shm) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Get IPI register memory for the device */
r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
if (!r_mem_ipi) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Map the whole shared-memory resource; the length is end - start + 1. */
nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WT);
/* A failed mapping is detected with IS_ERR() and returned as PTR_ERR(). */
if (IS_ERR(nic->base_addr_shm)) {
dev_err(dev, "Could not map shared memory at 0x%08llx",
(uint64_t __force)r_mem_shm->start);
return PTR_ERR(nic->base_addr_shm);
}
/*
 * NOTE(review): the IPI range is mapped exactly like the shared memory
 * (devm_memremap + MEMREMAP_WT), i.e. it is treated as ordinary memory —
 * confirm this is appropriate for a register block.
 */
nic->base_addr_ipi = devm_memremap(&pdev->dev, r_mem_ipi->start, r_mem_ipi->end - r_mem_ipi->start + 1, MEMREMAP_WT);
if (IS_ERR(nic->base_addr_ipi)) {
dev_err(dev, "Could not map IPI memory at 0x%08llx",
(uint64_t __force)r_mem_ipi->start);
return PTR_ERR(nic->base_addr_ipi);
}
映射成功。稍后在该函数中(成功获取 IRQ 之后),我将这两个内存区域清零,以获得干净的初始状态:
/*
 * Excerpt from the module's probe() routine: store the driver state on the
 * platform device, record start/end/size of both ranges, then zero both
 * mappings with memset_io().
 */
platform_set_drvdata(nic->pdev, nic);
/* Sizes are computed as end - start + 1. */
nic->shm_start = r_mem_shm->start;
nic->shm_end = r_mem_shm->end;
nic->shm_size = nic->shm_end - nic->shm_start + 1;
nic->ipi_start = r_mem_ipi->start;
nic->ipi_end = r_mem_ipi->end;
nic->ipi_size = nic->ipi_end - nic->ipi_start + 1;
/* Clear shared memory & IPI memory */
/* NOTE(review): the sizes are printed with %d — confirm this matches the type of shm_size/ipi_size. */
dev_info(dev, "before memset_io(shm....), nic->shm_size = %d", nic->shm_size);
memset_io(nic->base_addr_shm, 0, nic->shm_size);
dev_info(dev, "before memset_io(ipi....), nic->ipi_size = %d", nic->ipi_size);
/* Writes zeros across the entire second range (the one at 0xff340000 in the device tree). */
/* NOTE(review): that range is IPI register space per UG1085 Table 13-3 — verify that blanket-clearing it is permitted. */
memset_io(nic->base_addr_ipi, 0, nic->ipi_size);
printk(KERN_NOTICE "after memset_io(nic->base_addr_ipi)");
根据我的调试打印判断,nic->base_addr_shm 处的内存区域清零成功,而第二个 memset_io 调用导致了内核崩溃:
[ 5.234907] erika: loading out-of-tree module taints kernel.
[ 5.241470] <1>Hello world from erika module.
[ 5.246068] erika 3ed80000.shm: Device Tree Probing
[ 5.261139] erika 3ed80000.shm: erika shared memory at 0x3ed80000 mapped to 0xffff800015000000 with size 0x01000000
[ 5.275169] erika 3ed80000.shm: erika IPI memory at 0xff340000 mapped to 0xffff80001006d000 with size 0x00001000
[ 5.275577] zynqmp_r5_remoteproc zynqmp-rpu: RPU core_conf: split
[ 5.285399] erika 3ed80000.shm: before memset_io(shm....), nic->shm_size = 16777216
[ 5.293642] r5@0: no mailboxes.
[ 5.302430] remoteproc remoteproc0: r5@0 is available
[ 5.319927] erika 3ed80000.shm: before memset_io(ipi....), nic->ipi_size = 4096
[ 5.327430] SError Interrupt on CPU2, code 0xbf000002 -- SError
[ 5.327434] CPU: 2 PID: 371 Comm: udevd Tainted: G O 5.4.0-xilinx-v2020.2 #1
[ 5.327435] Hardware name: xlnx,zynqmp (DT)
[ 5.327437] pstate: 80000005 (Nzcv daif -PAN -UAO)
[ 5.327438] pc : __memset_io+0x68/0x98
[ 5.327440] lr : erika_probe+0x258/0x3bc [erika]
[ 5.327441] sp : ffff800012d6b940
[ 5.327443] x29: ffff800012d6b950 x28: 0000000000000100
[ 5.327446] x27: ffff80001013f510 x26: 000000000000002d
[ 5.327450] x25: ffff000877a98380 x24: ffff00087ab0d800
[ 5.327453] x23: ffff800008c513f8 x22: ffff00087aaa7000
[ 5.327456] x21: 0000000000000000 x20: ffff00087ab0d810
[ 5.327459] x19: ffff00087aaa77c0 x18: 0000000000000010
[ 5.327462] x17: 000000000f1828b4 x16: 00000000a67c5c83
[ 5.327465] x15: ffff00087a04b2e8 x14: ffffffffffffffff
[ 5.327469] x13: ffff800092d6b5b7 x12: ffff800012d6b5bf
[ 5.327472] x11: ffff8000110f5000 x10: 0000000000000000
[ 5.327475] x9 : ffff800011193000 x8 : 0000000000000152
[ 5.327478] x7 : 0000000000000006 x6 : ffff8000111930f2
[ 5.327481] x5 : 0000000000000003 x4 : 0000000000000000
[ 5.327484] x3 : 0000000000000000 x2 : 0000000000001000
[ 5.327488] x1 : ffff80001006e000 x0 : ffff80001006d0c8
[ 5.327492] Kernel panic - not syncing: Asynchronous SError Interrupt
[ 5.327494] CPU: 2 PID: 371 Comm: udevd Tainted: G O 5.4.0-xilinx-v2020.2 #1
[ 5.327496] Hardware name: xlnx,zynqmp (DT)
[ 5.327497] Call trace:
[ 5.327498] dump_backtrace+0x0/0x140
[ 5.327500] show_stack+0x14/0x20
[ 5.327501] dump_stack+0xac/0xd0
[ 5.327502] panic+0x140/0x30c
[ 5.327504] __stack_chk_fail+0x0/0x18
[ 5.327505] arm64_serror_panic+0x74/0x80
[ 5.327506] do_serror+0x114/0x118
[ 5.327508] el1_error+0x84/0xf8
[ 5.327509] __memset_io+0x68/0x98
[ 5.327510] platform_drv_probe+0x50/0xa0
[ 5.327511] really_probe+0xd8/0x2f8
[ 5.327513] driver_probe_device+0x54/0xe8
[ 5.327514] device_driver_attach+0x6c/0x78
[ 5.327516] __driver_attach+0x54/0xd0
[ 5.327517] bus_for_each_dev+0x6c/0xc0
[ 5.327518] driver_attach+0x20/0x28
[ 5.327520] bus_add_driver+0x148/0x1e0
[ 5.327521] driver_register+0x60/0x110
[ 5.327522] __platform_driver_register+0x44/0x50
[ 5.327524] erika_init+0x28/0x1000 [erika]
[ 5.327525] do_one_initcall+0x50/0x190
[ 5.327527] do_init_module+0x50/0x1f0
[ 5.327528] load_module+0x1ca4/0x2218
[ 5.327530] __do_sys_finit_module+0xd0/0xe8
[ 5.327531] __arm64_sys_finit_module+0x1c/0x28
[ 5.327533] el0_svc_common.constprop.0+0x68/0x160
[ 5.327534] el0_svc_handler+0x6c/0x88
[ 5.327535] el0_svc+0x8/0xc
[ 5.327548] SMP: stopping secondary CPUs
[ 5.327549] Kernel Offset: disabled
[ 5.327551] CPU features: 0x0002,20002004
[ 5.327552] Memory Limit: none
我想我解决了这个问题(也感谢 0andriy 的评论):
我的错误在于,把 IPI 寄存器空间
0x0 0xff340000 0x0 0x1000
当作了普通内存来处理,而它实际上是通道 7 进行 IPI 通信所用的一组特定寄存器的所在位置(参见 https://www.xilinx.com/support/documentation/user_guides/ug1085-zynq-ultrascale-trm.pdf 中的表 13-3)。你不应该(而且显然也无法在不破坏 [Peta]Linux 的前提下)简单地把这块区域清零。
我现在把代码改成了下面这样,看起来可以工作了(至少系统现在可以正常启动;我还需要和同事确认现在是否真的能够进行 IPI 通信)。主要改动是:共享内存改用 MEMREMAP_WB 映射,并用 memset() 清零;IPI 寄存器区域改用 devm_ioremap_resource() 映射,并且不再对其清零:
/*
 * Revised excerpt from the module's probe() routine: the shared memory is
 * mapped with devm_memremap() using MEMREMAP_WB and cleared with memset();
 * the IPI range is mapped with devm_ioremap_resource() and is not cleared.
 */
/* Get shared memory for the device */
r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
if (!r_mem_shm) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Get IPI register memory for the device */
r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
if (!r_mem_ipi) {
dev_err(dev, "invalid address\n");
return -ENODEV;
}
/* Map the whole shared-memory resource; the length is end - start + 1. */
nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WB);
if (IS_ERR(nic->base_addr_shm)) {
/* %pR prints the resource itself instead of a hand-formatted start address. */
dev_err(dev, "Could not map shared memory at %pR", r_mem_shm);
return PTR_ERR(nic->base_addr_shm);
}
/* The IPI range is mapped through the I/O remap helper, not as memory. */
nic->base_addr_ipi = devm_ioremap_resource(&pdev->dev, r_mem_ipi);
if (IS_ERR(nic->base_addr_ipi)) {
dev_err(dev, "Could not map IPI memory at %pR", r_mem_ipi);
return PTR_ERR(nic->base_addr_ipi);
}
/* Clear shared memory */
/* NOTE(review): nic->shm_size must already be set here; its assignment is not shown in this excerpt. */
memset(nic->base_addr_shm, 0, nic->shm_size);