在 PetaLinux 内核模块中对设备树指定的内存区域调用 memset_io 时出现 SError 内核恐慌 (kernel panic)

SError kernel panic when memset_io'ing on device-tree memory area in Petalinux kernel module

我的平台: ZynQ MP; PetaLinux 2020.2

构建系统: Ubuntu 18.04

我正在编写一个内核模块,它在启动时使用 compatible 字符串 'erika' 将自己作为 platform_device 注册到 PetaLinux 内核。

我在 system-user.dtsi 文件中为此设备指定了两个内存区域:一个较大的区域 (16M),用于保存在 APU 上的 PetaLinux 与 RPU 上的裸机实现之间交换的数据;另一个较小的区域 (4k),用于保存 APU 上的内核模块与 RPU 上的裸机应用程序之间共享的中断信号:

/*
 * Device-tree additions for the ERIKA kernel module: a reserved no-map RAM
 * region, the "erika,erika" platform device (two reg ranges plus one
 * interrupt), the R5 remoteproc node, and an override of the ttc0 node.
 */
/include/ "system-conf.dtsi"
/{
    reserved-memory
    {
        #address-cells = <2>;
        #size-cells = <2>;
        ranges;
        /* 0x2000000 bytes (32 MiB) starting at 0x3ed00000, marked no-map; */
        /* referenced by the memory-region property of r5_0 below */
        rproc_0_reserved: rproc@0x3ed00000
        {
            no-map;
            reg = <0x0 0x3ed00000 0x0 0x2000000>;
        };
    };

    amba
    {
        /* Shared memory, IPI memory & interrupt for ERIKA module */
        /* NOTE(review): the unit address in "shm@0" does not match the first */
        /* reg address (0x3ed80000) - dtc may warn about this; confirm */
        shm0: shm@0
        {
            compatible = "erika,erika";
            /* Our erika kernel module will use the first memory range for SHM */
            /* and the second memory range for IPI flags */
            /* Range 0: 0x1000000 bytes (16 MiB) at 0x3ed80000; this range lies */
            /* entirely inside rproc_0_reserved (0x3ed00000 + 0x2000000) above */
            /* Range 1: 0x1000 bytes (4 KiB) at 0xff340000 */
            reg = <0x0 0x3ed80000 0x0 0x1000000
                       0x0 0xff340000 0x0 0x1000>;
            interrupt-parent = <&gic>;
            /* 'interrupts' vector - meaning: */
            /* 1st index: 0 --> SPI (shared peripheral interrupt) */
            /* 2nd index: Interrupt 29 => +32 (SPI) => 61 (HW IRQ#) = IPI_Ch7 */
            /* 3rd index: 4 --> level-triggered interrupt (active high) */
            /* acc. to Xilinx UG1085 (v2020.2), Table 13-1 */
            interrupts = <0 29 4>;
        };
    };
    /* Remoteproc node for the R5 cores, configured in "split" mode */
    zynqmp-rpu
    {
        compatible = "xlnx,zynqmp-r5-remoteproc-1.0";
        #address-cells = <2>;
        #size-cells = <2>;
        ranges;
        core_conf = "split";
        r5_0: r5@0
        {
            #address-cells = <2>;
            #size-cells = <2>;
            ranges;
            memory-region = <&rproc_0_reserved>;
            pnode-id = <0x7>;
            /* Two TCM windows of 0x10000 bytes (64 KiB) each */
            tcm_0_a: tcm_0@0
            {
                    reg = <0x0 0xFFE00000 0x0 0x10000>;
                    pnode-id = <0xf>;
            };
            tcm_0_b: tcm_0@1
            {
                    reg = <0x0 0xFFE20000 0x0 0x10000>;
                    pnode-id = <0x10>;
            };
        };
    };
};&ttc0
{
    /* NOTE(review): replaces the compatible string of the existing ttc0 node; */
    /* presumably so that no Linux driver binds to the timer - confirm */
    compatible = "ttc0";
    status = "okay";
};

在模块的 probe() 函数中,我将两个内存区域映射为:

/*
 * Excerpt from the module's probe(): look up the two MEM resources of the
 * platform device (index 0 = shared memory, index 1 = IPI registers) and map
 * both of them with devm_memremap(..., MEMREMAP_WT). Any failure returns an
 * error code from probe().
 */
/* Get shared memory for the device */
r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
if (!r_mem_shm) {
   dev_err(dev, "invalid address\n");
   return -ENODEV;
}

/* Get IPI register memory for the device */
r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
if (!r_mem_ipi) {
   dev_err(dev, "invalid address\n");
   return -ENODEV;
}

/* Mapping length is end - start + 1 because the resource end is inclusive */
nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WT);
if (IS_ERR(nic->base_addr_shm)) {
    dev_err(dev, "Could not map shared memory at 0x%08llx",
        (uint64_t __force)r_mem_shm->start);
    return PTR_ERR(nic->base_addr_shm);
}

/*
 * NOTE: the IPI register window is mapped here exactly like the RAM region
 * above; the revised code later in this document maps it with
 * devm_ioremap_resource() instead.
 */
nic->base_addr_ipi = devm_memremap(&pdev->dev, r_mem_ipi->start, r_mem_ipi->end - r_mem_ipi->start + 1, MEMREMAP_WT);
if (IS_ERR(nic->base_addr_ipi)) {
    dev_err(dev, "Could not map IPI memory at 0x%08llx",
        (uint64_t __force)r_mem_ipi->start);
    return PTR_ERR(nic->base_addr_ipi);
}

这成功了。稍后在函数中(成功获取 IRQ 后),我将这两个内存区域清零以获得干净的起始状态:

/*
 * Excerpt from later in probe(): record the physical bounds and sizes of both
 * regions in the driver state, then zero both mappings with memset_io().
 * The log messages bracket each call so the failing one can be identified.
 */
platform_set_drvdata(nic->pdev, nic);
/* Sizes are end - start + 1 because the resource end is inclusive */
nic->shm_start = r_mem_shm->start;
nic->shm_end = r_mem_shm->end;
nic->shm_size = nic->shm_end - nic->shm_start + 1;
nic->ipi_start = r_mem_ipi->start;
nic->ipi_end = r_mem_ipi->end;
nic->ipi_size = nic->ipi_end - nic->ipi_start + 1;

/* Clear shared memory & IPI memory */
/* NOTE(review): %d assumes the size fields are int-sized; their declared type is not shown here - confirm */
dev_info(dev, "before memset_io(shm....), nic->shm_size = %d", nic->shm_size);
memset_io(nic->base_addr_shm, 0, nic->shm_size);
dev_info(dev, "before memset_io(ipi....), nic->ipi_size = %d", nic->ipi_size);
/* This call zeroes the whole IPI register window; per the log below, the SError is raised inside it */
memset_io(nic->base_addr_ipi, 0, nic->ipi_size);
printk(KERN_NOTICE "after memset_io(nic->base_addr_ipi)");

根据我的调试打印判断,将 nic->base_addr_shm 处的内存区域清零成功,而第二个 memset_io 调用导致内核崩溃:

[    5.234907] erika: loading out-of-tree module taints kernel.
[    5.241470] <1>Hello world from erika module.
[    5.246068] erika 3ed80000.shm: Device Tree Probing
[    5.261139] erika 3ed80000.shm: erika shared memory at 0x3ed80000 mapped to 0xffff800015000000 with size 0x01000000
[    5.275169] erika 3ed80000.shm: erika IPI memory at 0xff340000 mapped to 0xffff80001006d000 with size 0x00001000
[    5.275577] zynqmp_r5_remoteproc zynqmp-rpu: RPU core_conf: split
[    5.285399] erika 3ed80000.shm: before memset_io(shm....), nic->shm_size = 16777216
[    5.293642]  r5@0: no mailboxes.
[    5.302430] remoteproc remoteproc0: r5@0 is available
[    5.319927] erika 3ed80000.shm: before memset_io(ipi....), nic->ipi_size = 4096
[    5.327430] SError Interrupt on CPU2, code 0xbf000002 -- SError
[    5.327434] CPU: 2 PID: 371 Comm: udevd Tainted: G           O      5.4.0-xilinx-v2020.2 #1
[    5.327435] Hardware name: xlnx,zynqmp (DT)
[    5.327437] pstate: 80000005 (Nzcv daif -PAN -UAO)
[    5.327438] pc : __memset_io+0x68/0x98
[    5.327440] lr : erika_probe+0x258/0x3bc [erika]
[    5.327441] sp : ffff800012d6b940
[    5.327443] x29: ffff800012d6b950 x28: 0000000000000100
[    5.327446] x27: ffff80001013f510 x26: 000000000000002d
[    5.327450] x25: ffff000877a98380 x24: ffff00087ab0d800
[    5.327453] x23: ffff800008c513f8 x22: ffff00087aaa7000
[    5.327456] x21: 0000000000000000 x20: ffff00087ab0d810
[    5.327459] x19: ffff00087aaa77c0 x18: 0000000000000010
[    5.327462] x17: 000000000f1828b4 x16: 00000000a67c5c83
[    5.327465] x15: ffff00087a04b2e8 x14: ffffffffffffffff
[    5.327469] x13: ffff800092d6b5b7 x12: ffff800012d6b5bf
[    5.327472] x11: ffff8000110f5000 x10: 0000000000000000
[    5.327475] x9 : ffff800011193000 x8 : 0000000000000152
[    5.327478] x7 : 0000000000000006 x6 : ffff8000111930f2
[    5.327481] x5 : 0000000000000003 x4 : 0000000000000000
[    5.327484] x3 : 0000000000000000 x2 : 0000000000001000
[    5.327488] x1 : ffff80001006e000 x0 : ffff80001006d0c8
[    5.327492] Kernel panic - not syncing: Asynchronous SError Interrupt
[    5.327494] CPU: 2 PID: 371 Comm: udevd Tainted: G           O      5.4.0-xilinx-v2020.2 #1
[    5.327496] Hardware name: xlnx,zynqmp (DT)
[    5.327497] Call trace:
[    5.327498]  dump_backtrace+0x0/0x140
[    5.327500]  show_stack+0x14/0x20
[    5.327501]  dump_stack+0xac/0xd0
[    5.327502]  panic+0x140/0x30c
[    5.327504]  __stack_chk_fail+0x0/0x18
[    5.327505]  arm64_serror_panic+0x74/0x80
[    5.327506]  do_serror+0x114/0x118
[    5.327508]  el1_error+0x84/0xf8
[    5.327509]  __memset_io+0x68/0x98
[    5.327510]  platform_drv_probe+0x50/0xa0
[    5.327511]  really_probe+0xd8/0x2f8
[    5.327513]  driver_probe_device+0x54/0xe8
[    5.327514]  device_driver_attach+0x6c/0x78
[    5.327516]  __driver_attach+0x54/0xd0
[    5.327517]  bus_for_each_dev+0x6c/0xc0
[    5.327518]  driver_attach+0x20/0x28
[    5.327520]  bus_add_driver+0x148/0x1e0
[    5.327521]  driver_register+0x60/0x110
[    5.327522]  __platform_driver_register+0x44/0x50
[    5.327524]  erika_init+0x28/0x1000 [erika]
[    5.327525]  do_one_initcall+0x50/0x190
[    5.327527]  do_init_module+0x50/0x1f0
[    5.327528]  load_module+0x1ca4/0x2218
[    5.327530]  __do_sys_finit_module+0xd0/0xe8
[    5.327531]  __arm64_sys_finit_module+0x1c/0x28
[    5.327533]  el0_svc_common.constprop.0+0x68/0x160
[    5.327534]  el0_svc_handler+0x6c/0x88
[    5.327535]  el0_svc+0x8/0xc
[    5.327548] SMP: stopping secondary CPUs
[    5.327549] Kernel Offset: disabled
[    5.327551] CPU features: 0x0002,20002004
[    5.327552] Memory Limit: none

我想我解决了这个问题(也感谢 0andriy 的评论):

我的错误在于,我把 IPI 寄存器空间

0x0 0xff340000 0x0 0x1000

当作普通内存来处理,而它实际上是通道 7 进行 IPI 通信所用的一组特定寄存器所在的位置(参见 https://www.xilinx.com/support/documentation/user_guides/ug1085-zynq-ultrascale-trm.pdf 中的 Table 13-3)。你不应该(而且显然也无法在不搞坏 [Peta]Linux 的情况下)简单地把这个区域清零。

我现在改了代码如下,好像可以用了(至少系统现在可以正常启动了,我还得和我的同事确认我们现在是否真的可以进行IPI通信):

    /*
     * Revised probe() excerpt: the shared-memory range is still mapped with
     * devm_memremap() (now MEMREMAP_WB), while the IPI register range is
     * mapped with devm_ioremap_resource(). Only the shared memory is cleared;
     * the IPI registers are no longer zeroed.
     */
    /* Get shared memory for the device */
    r_mem_shm = platform_get_resource(nic->pdev, IORESOURCE_MEM, 0);
    if (!r_mem_shm) {
        dev_err(dev, "invalid address\n");
        return -ENODEV;
    }

    /* Get IPI register memory for the device */
    r_mem_ipi = platform_get_resource(nic->pdev, IORESOURCE_MEM, 1);
    if (!r_mem_ipi) {
        dev_err(dev, "invalid address\n");
        return -ENODEV;
    }

    /* Mapping length is end - start + 1 because the resource end is inclusive */
    nic->base_addr_shm = devm_memremap(&pdev->dev, r_mem_shm->start, r_mem_shm->end - r_mem_shm->start + 1, MEMREMAP_WB);
    if (IS_ERR(nic->base_addr_shm)) {
        dev_err(dev, "Could not map shared memory at %pR", r_mem_shm);
        return PTR_ERR(nic->base_addr_shm);
    }

    /* Register window: mapped as device I/O rather than as memory */
    nic->base_addr_ipi = devm_ioremap_resource(&pdev->dev, r_mem_ipi);
    if (IS_ERR(nic->base_addr_ipi)) {
        dev_err(dev, "Could not map IPI memory at %pR", r_mem_ipi);
        return PTR_ERR(nic->base_addr_ipi);
    }

    /* Clear shared memory */
    /* NOTE(review): nic->shm_size must already be assigned at this point; the assignment is not visible in this excerpt - confirm */
    memset(nic->base_addr_shm, 0, nic->shm_size);