使用 dpdk 19 和 Mellanox ConnectX-5 时出现 softlockup(软锁死)

softlockup with dpdk 19 mellanox connectx5

我的服务器系统是 CentOS 7.9,内核版本为 3.10.0-1160.53.1.el7.x86_64

当我运行并附加(attach)到我的 dpdk 19 多进程应用程序时,会出现 softlockup

我运行程序的服务器上有 2 个 ixgbe 10G 网口和一个 100G ConnectX-5 网卡

/home/testpmd --no-affinity -l 1-62 -n 4 --proc-type primary --no-hpet -w 0000:03:00.0 -w 0000:03:00.1 -w 0000:87:00.0

如果 dpdk 不使用 mellanox 卡,则不会发生 softlockup

如果 dpdk 不使用多进程(不带 -l 1-62 参数),则不会发生 softlockup

在 10G 网口上,所有数据包都被丢弃(Rx imissed)

dmesg 中的日志如下:

[ 2853.278031] NMI watchdog: BUG: soft lockup - CPU#63 stuck for 23s! [eal-intr-thread:6322]
[ 2853.278055] Modules linked in: igb_uio(OE) uio igb rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) kvm_amd kvm irqbypass crc32_pclmul ghash_clmulni_intel ipmi_ssif vfat fat xfs aesni_intel lrw gf128mul glue_helper ablk_helper libcrc32c cryptd joydev pcspkr hpilo hpwdt i2c_piix4 wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_cpufreq binfmt_misc ip_tables ext4 mbcache jbd2 mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm nvme drm_panel_orientation_quirks bnxt_en ixgbe mlx5_core(OE) mlxfw(OE) psample auxiliary(OE) mlx_compat(OE) devlink nvme_core mdio ptp pps_core dca dm_mirror dm_region_hash dm_log dm_mod
[ 2853.278096] CPU: 63 PID: 6322 Comm: eal-intr-thread Tainted: G           OE  ------------   3.10.0-1160.53.1.el7.x86_64 #1
[ 2853.278098] Hardware name: HPE ProLiant DL325 Gen10 Plus v2/ProLiant DL325 Gen10 Plus v2, BIOS A43 12/03/2021
[ 2853.278099] task: ffff8d8693b79080 ti: ffff8d86a50d8000 task.ti: ffff8d86a50d8000
[ 2853.278100] RIP: 0010:[<ffffffffab9fd43b>]  [<ffffffffab9fd43b>] iommu_unmap_page+0x2b/0x110
[ 2853.278106] RSP: 0000:ffff8d86a50db928  EFLAGS: 00000206
[ 2853.278106] RAX: 0000000000000000 RBX: 000000000000002d RCX: 0000000000000027
[ 2853.278107] RDX: 0000000000001000 RSI: 00000a00405eb000 RDI: ffff8d6f5dfcd000
[ 2853.278108] RBP: ffff8d86a50db960 R08: 0000000000000004 R09: 0000000000000000
[ 2853.278109] R10: 0000000000000001 R11: 000ffffffffff000 R12: 0000000000000216
[ 2853.278110] R13: ffff8d86a50db918 R14: ffffffffab4ad8b0 R15: ffff8d8700000025
[ 2853.278111] FS:  00007fffde1f4700(0000) GS:ffff8d877efc0000(0000) knlGS:0000000000000000
[ 2853.278112] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2853.278113] CR2: 00007fffe59589e0 CR3: 0000001711410000 CR4: 0000000000760ee0
[ 2853.278114] PKRU: 55555554
[ 2853.278115] Call Trace:
[ 2853.278119]  [<ffffffffab9ff893>] __unmap_single.isra.22+0x63/0x200
[ 2853.278121]  [<ffffffffaba00d9f>] unmap_sg+0x5f/0x70
[ 2853.278131]  [<ffffffffc0518014>] __ib_umem_release+0x134/0x160 [ib_core]
[ 2853.278136]  [<ffffffffc0518079>] ib_umem_release+0x39/0xd0 [ib_core]
[ 2853.278142]  [<ffffffffc055f8f8>] mlx5_ib_dereg_mr+0x1a8/0x4b0 [mlx5_ib]
[ 2853.278147]  [<ffffffffc04ed6a6>] ib_dereg_mr_user+0x46/0x80 [ib_core]
[ 2853.278151]  [<ffffffffc02971a5>] uverbs_free_mr+0x15/0x20 [ib_uverbs]
[ 2853.278154]  [<ffffffffc0293139>] destroy_hw_idr_uobject+0x19/0x20 [ib_uverbs]
[ 2853.278156]  [<ffffffffc029381d>] uverbs_destroy_uobject+0x3d/0x160 [ib_uverbs]
[ 2853.278159]  [<ffffffffc0293a02>] __uverbs_cleanup_ufile+0xc2/0x160 [ib_uverbs]
[ 2853.278162]  [<ffffffffc02941c3>] uverbs_destroy_ufile_hw+0x43/0x120 [ib_uverbs]
[ 2853.278164]  [<ffffffffc028a0e4>] ib_uverbs_close+0x24/0x70 [ib_uverbs]
[ 2853.278167]  [<ffffffffab65088c>] __fput+0xec/0x230
[ 2853.278168]  [<ffffffffab650abe>] ____fput+0xe/0x10
[ 2853.278171]  [<ffffffffab4c299b>] task_work_run+0xbb/0xe0
[ 2853.278173]  [<ffffffffab4a1954>] do_exit+0x2d4/0xa30
[ 2853.278177]  [<ffffffffabb88e30>] ? __schedule+0x320/0x680
[ 2853.278178]  [<ffffffffab4a212f>] do_group_exit+0x3f/0xa0
[ 2853.278180]  [<ffffffffab4b328e>] get_signal_to_deliver+0x1ce/0x5e0
[ 2853.278183]  [<ffffffffab42c527>] do_signal+0x57/0x6f0
[ 2853.278185]  [<ffffffffab69d87e>] ? ep_poll+0x31e/0x360
[ 2853.278188]  [<ffffffffab4dadf0>] ? wake_up_state+0x20/0x20
[ 2853.278189]  [<ffffffffab42cc32>] do_notify_resume+0x72/0xc0
[ 2853.278192]  [<ffffffffabb962ef>] int_signal+0x12/0x17
[ 2853.278193] Code: 66 66 66 66 90 55 48 89 e5 41 57 41 56 41 55 41 54 53 48 89 d3 48 83 ec 10 65 48 8b 04 25 28 00 00 00 48 89 45 d0 31 c0 48 85 d2 <0f> 84 be 00 00 00 48 8d 42 ff 48 85 d0 0f 85 b1 00 00 00 4c 8d
[ 2857.210797] NMI watchdog: BUG: soft lockup - CPU#16 stuck for 22s! [swapper/16:0]
[ 2857.210840] Modules linked in: igb_uio(OE) uio igb rdma_ucm(OE) rdma_cm(OE) iw_cm(OE) ib_ipoib(OE) ib_cm(OE) ib_umad(OE) kvm_amd kvm irqbypass crc32_pclmul ghash_clmulni_intel ipmi_ssif vfat fat xfs aesni_intel lrw gf128mul glue_helper ablk_helper libcrc32c cryptd joydev pcspkr hpilo hpwdt i2c_piix4 wmi ipmi_si ipmi_devintf ipmi_msghandler acpi_power_meter acpi_cpufreq binfmt_misc ip_tables ext4 mbcache jbd2 mlx5_ib(OE) ib_uverbs(OE) ib_core(OE) mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm crct10dif_pclmul crct10dif_common crc32c_intel drm nvme drm_panel_orientation_quirks bnxt_en ixgbe mlx5_core(OE) mlxfw(OE) psample auxiliary(OE) mlx_compat(OE) devlink nvme_core mdio ptp pps_core dca dm_mirror dm_region_hash dm_log dm_mod
[ 2857.210903] CPU: 16 PID: 0 Comm: swapper/16 Tainted: G           OEL ------------   3.10.0-1160.53.1.el7.x86_64 #1
[ 2857.210906] Hardware name: HPE ProLiant DL325 Gen10 Plus v2/ProLiant DL325 Gen10 Plus v2, BIOS A43 12/03/2021
[ 2857.210909] task: ffff8d6fecd96300 ti: ffff8d6fecdec000 task.ti: ffff8d6fecdec000
[ 2857.210911] RIP: 0010:[<ffffffffab4a4b97>]  [<ffffffffab4a4b97>] __do_softirq+0x97/0x280
[ 2857.210920] RSP: 0018:ffff8d775fa03f20  EFLAGS: 00000206
[ 2857.210922] RAX: ffff8d6fecdeffd8 RBX: ffff8d775fa15ad8 RCX: 0000000000000010
[ 2857.210923] RDX: 00000001002695e8 RSI: 00000000013e1d87 RDI: ffff8d6fecd96300
[ 2857.210925] RBP: ffff8d775fa03f80 R08: 00000293656b9f40 R09: ffff8d775fa03de0
[ 2857.210927] R10: 0000000000000004 R11: 0000000000000005 R12: ffff8d775fa03e98
[ 2857.210929] R13: ffffffffabb96fba R14: ffff8d775fa03f80 R15: ffff8d6fecdeffd8
[ 2857.210932] FS:  0000000000000000(0000) GS:ffff8d775fa00000(0000) knlGS:0000000000000000
[ 2857.210934] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2857.210936] CR2: 00000000028d1478 CR3: 0000001711410000 CR4: 0000000000760ee0
[ 2857.210938] PKRU: 00000000
[ 2857.210939] Call Trace:
[ 2857.210942]  <IRQ>
[ 2857.210949]  [<ffffffffabb994ec>] call_softirq+0x1c/0x30
[ 2857.210954]  [<ffffffffab42f715>] do_softirq+0x65/0xa0
[ 2857.210958]  [<ffffffffab4a4f75>] irq_exit+0x105/0x110
[ 2857.210961]  [<ffffffffabb9aa28>] smp_apic_timer_interrupt+0x48/0x60
[ 2857.210964]  [<ffffffffabb96fba>] apic_timer_interrupt+0x16a/0x170
[ 2857.210966]  <EOI>
[ 2857.210971]  [<ffffffffab9c8dc4>] ? cpuidle_enter_state+0x54/0xd0
[ 2857.210974]  [<ffffffffab9c8f1e>] cpuidle_idle_call+0xde/0x230
[ 2857.210978]  [<ffffffffab437c8e>] arch_cpu_idle+0xe/0xc0
[ 2857.210983]  [<ffffffffab50181a>] cpu_startup_entry+0x14a/0x1e0
[ 2857.210988]  [<ffffffffab45a857>] start_secondary+0x1f7/0x270
[ 2857.210992]  [<ffffffffab4000d5>] start_cpu+0x5/0x14

在 grub 的内核启动参数中添加 iommu=pt intel_iommu=on 后,softlockup 和 10G Rx 丢包的问题都得到了解决