在原子上下文中调用 netlink_broadcast() 导致 "BUG: scheduling while atomic"
Calling netlink_broadcast() cause BUG scheduling while atomic
我正在为硬件 phone 模块开发自定义 dahdi 驱动程序。该模块通过 usb(用户空间 libusb D2XX ftdi 驱动程序)连接到 pc。所以驱动程序被分成两部分:内核模式 dahdi 驱动程序和用户空间服务。
用户空间守护进程检测到 USB 卡后,通过 netlink 请求内核 dahdi 驱动程序添加/移除 dahdi span。添加新的 span 后,即可与设备通信、收发音频和命令。
我用单张 phone 卡测试成功(热插拔设备、拨号、通话、热移除均无任何错误),但开始多卡测试后,在 netlink_broadcast() 调用中出现了 "BUG: scheduling while atomic",调用栈如下:
[ 5322.363190] [<ffffffffae709113>] __schedule_bug+0x64/0x72
[ 5322.363196] [<ffffffffae713fdb>] __schedule+0x9fb/0xa20
[ 5322.363202] [<ffffffffae0cc1d6>] __cond_resched+0x26/0x30
[ 5322.363205] [<ffffffffae7142ca>] _cond_resched+0x3a/0x50
[ 5322.363209] [<ffffffffae1f7a35>] kmem_cache_alloc+0x35/0x1f0
[ 5322.363216] [<ffffffffae5d7ad9>] ? skb_clone+0x49/0xb0
[ 5322.363218] [<ffffffffae5d7ad9>] skb_clone+0x49/0xb0
[ 5322.363225] [<ffffffffae623ca1>] netlink_broadcast_filtered+0x331/0x3e0
[ 5322.363227] [<ffffffffae623d6d>] netlink_broadcast+0x1d/0x20
[ 5322.363231] [<ffffffffc0c113ca>] nl_send_cmd+0x15a/0x260 [mydev]
[ 5322.363234] [<ffffffffc0c11531>] mydev_hooksig+0x61/0x80 [mydev]
[ 5322.363240] [<ffffffffc082ecdb>] dahdi_rbs_sethook+0x9b/0x220 [dahdi]
[ 5322.363244] [<ffffffffc0833566>] _dahdi_transmit+0x4c6/0x5b0 [dahdi]
[ 5322.363248] [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363250] [<ffffffffc0c10436>] ? queue_write+0x66/0xd0 [mydev]
[ 5322.363252] [<ffffffffc0c10963>] mydev_tick+0x1e3/0x2b0 [mydev]
[ 5322.363256] [<ffffffffc08365fe>] _process_masterspan+0x5be/0x680 [dahdi]
[ 5322.363259] [<ffffffffc0836a1c>] _dahdi_receive+0x35c/0x3a0 [dahdi]
[ 5322.363263] [<ffffffffc07b28b7>] g4_interrupt+0x3b7/0xc7b [opvxg4xx]
[ 5322.363266] [<ffffffffc08332da>] ? _dahdi_transmit+0x23a/0x5b0 [dahdi]
[ 5322.363270] [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363274] [<ffffffffae141284>] __handle_irq_event_percpu+0x44/0x1c0
[ 5322.363276] [<ffffffffae141432>] handle_irq_event_percpu+0x32/0x80
[ 5322.363277] [<ffffffffae1414bc>] handle_irq_event+0x3c/0x60
[ 5322.363281] [<ffffffffae144ab9>] handle_fasteoi_irq+0x59/0x110
[ 5322.363285] [<ffffffffae02d504>] handle_irq+0xe4/0x1a0
[ 5322.363290] [<ffffffffae1029fc>] ? tick_check_idle+0x8c/0xd0
内核驱动结构非常简单。它使用回调列表注册 dahdi 设备:
/*
 * DAHDI span operations for this driver: the table of callbacks that the
 * DAHDI core invokes on the span and its channels.
 */
static const struct dahdi_span_ops mydev_span_ops = {
	.owner      = THIS_MODULE,

	.spanconfig = mydev_spanconfig,
	.chanconfig = mydev_chanconfig,
	.startup    = mydev_startup,
	.shutdown   = mydev_shutdown,

	.open       = mydev_open,
	.close      = mydev_close,
	.ioctl      = mydev_ioctl,

	/* Forwards hook-state changes to userspace (see mydev_hooksig()). */
	.hooksig    = mydev_hooksig,
	.sync_tick  = mydev_tick,
};
hooksig 回调由 dahdi 模块在摘机/挂机(off-hook/on-hook)时调用,它只是向用户空间守护进程发送一条 netlink 消息:
/*
 * mydev_hooksig - DAHDI .hooksig callback: forward a hook-state change to
 * the userspace daemon.
 * @chan:  channel whose TX signalling state changes; NULL is ignored
 * @txsig: requested signalling state
 *
 * START and OFFHOOK are reported as an "off" command, ONHOOK as an "on"
 * command; any other value is only logged.  The result of nl_send_cmd() is
 * not propagated.
 *
 * NOTE(review): according to the surrounding discussion DAHDI calls this
 * with a spinlock held and interrupts disabled, so nothing reached from
 * here may sleep.
 *
 * Always returns 0.
 */
static int mydev_hooksig(struct dahdi_chan *chan, enum dahdi_txsig txsig)
{
	struct mydevp *card;
	int reg = 0;	/* never updated; only echoed in the debug line below */

	if (!chan)
		return 0;

	card = chan->pvt;

	if (txsig == DAHDI_TXSIG_START || txsig == DAHDI_TXSIG_OFFHOOK)
		nl_send_cmd(chan->chanpos, card->serial, "off", OP_OFFHOOK);
	else if (txsig == DAHDI_TXSIG_ONHOOK)
		nl_send_cmd(chan->chanpos, card->serial, "on", OP_ONHOOK);
	else
		printk(KERN_NOTICE "dahdi_mydev: Can't set tx state to unknown %d\n", txsig);

	printk(KERN_DEBUG "dahdi_mydev: Setting hook state to %d (%02x)\n", txsig, reg);
	return 0;
}
dahdi_rbs_sethook()->mydev_hooksig() 函数在 dahdi 模块中的多个位置调用,并且总是用 spin_lock_irqsave()/spin_unlock_irqrestore() 包裹,但是我不明白为什么只连接一张卡问题就消失了。
所以我就是不知道如何从这个回调中正确发送 netlink 消息。 netlink_broadcast 是否有任何非睡眠变体?
add1:尝试过先 spin_unlock、发送消息、再 spin_lock,但没有帮助。
add2:这是我的 nl_send_cmd() 函数的代码
int nl_send_cmd(int chan, char *serial, char *dial, int op) {
int rc = 0;
struct sk_buff *skb;
void *msg_head;
pr_debug("dahdi_mydev: trying to sent dial string to userspace\n");
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
pr_err("dahdi_mydev: genlmsg_new() failed.\n");
return -ENOMEM;
}
msg_head = genlmsg_put(skb, 0, 0, &span_gnl_family, 0, SPAN_DIAL_CMD);
if (!msg_head) {
pr_err("dahdi_mydev: genlmsg_put() failed.\n");
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_string(skb, ATTR1_STRING, serial);
if (rc) {
pr_err("dahdi_mydev: nla_put_string() failed for serial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_string(skb, ATTR4_STRING, dial);
if (rc) {
pr_err("dahdi_mydev: nla_put_string() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_u32(skb, ATTR2_SINT32, chan);
if (rc) {
pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_u32(skb, ATTR5_SINT32, op);
if (rc) {
pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
genlmsg_end(skb, msg_head);
rc = genlmsg_multicast(&span_gnl_family, skb, 0, 0, GFP_KERNEL);
if (rc) {
pr_info("dahdi_mydev: Dial message didn't sent - no listeners ?\n");
return -ENOTCONN;
}
pr_debug("dahdi_mydev: NL msg for %s sent with '%s' op %d\n", serial, dial, op);
return 0;
}
好的。我花了一些时间找到了答案。解决方案是在 genlmsg_new() 和 genlmsg_multicast() 调用中用 GFP_ATOMIC 替换 GFP_KERNEL,因为最终被调用的 netlink_broadcast() 函数内部有如下代码:
if (info.delivered) {
if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
我认为在单张 USB 卡的情况下,拥塞标志为 false,因此根本不会睡眠。连接另一张卡后,netlink 消息大量增加 -> 拥塞标志置位,加上分配标志为 GFP_KERNEL -> 在持锁的临界区内睡眠。
我正在为硬件 phone 模块开发自定义 dahdi 驱动程序。该模块通过 usb(用户空间 libusb D2XX ftdi 驱动程序)连接到 pc。所以驱动程序被分成两部分:内核模式 dahdi 驱动程序和用户空间服务。
用户空间守护进程检测到 USB 卡后,通过 netlink 请求内核 dahdi 驱动程序添加/移除 dahdi span。添加新的 span 后,即可与设备通信、收发音频和命令。
我用单张 phone 卡测试成功(热插拔设备、拨号、通话、热移除均无任何错误),但开始多卡测试后,在 netlink_broadcast() 调用中出现了 "BUG: scheduling while atomic",调用栈如下:
[ 5322.363190] [<ffffffffae709113>] __schedule_bug+0x64/0x72
[ 5322.363196] [<ffffffffae713fdb>] __schedule+0x9fb/0xa20
[ 5322.363202] [<ffffffffae0cc1d6>] __cond_resched+0x26/0x30
[ 5322.363205] [<ffffffffae7142ca>] _cond_resched+0x3a/0x50
[ 5322.363209] [<ffffffffae1f7a35>] kmem_cache_alloc+0x35/0x1f0
[ 5322.363216] [<ffffffffae5d7ad9>] ? skb_clone+0x49/0xb0
[ 5322.363218] [<ffffffffae5d7ad9>] skb_clone+0x49/0xb0
[ 5322.363225] [<ffffffffae623ca1>] netlink_broadcast_filtered+0x331/0x3e0
[ 5322.363227] [<ffffffffae623d6d>] netlink_broadcast+0x1d/0x20
[ 5322.363231] [<ffffffffc0c113ca>] nl_send_cmd+0x15a/0x260 [mydev]
[ 5322.363234] [<ffffffffc0c11531>] mydev_hooksig+0x61/0x80 [mydev]
[ 5322.363240] [<ffffffffc082ecdb>] dahdi_rbs_sethook+0x9b/0x220 [dahdi]
[ 5322.363244] [<ffffffffc0833566>] _dahdi_transmit+0x4c6/0x5b0 [dahdi]
[ 5322.363248] [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363250] [<ffffffffc0c10436>] ? queue_write+0x66/0xd0 [mydev]
[ 5322.363252] [<ffffffffc0c10963>] mydev_tick+0x1e3/0x2b0 [mydev]
[ 5322.363256] [<ffffffffc08365fe>] _process_masterspan+0x5be/0x680 [dahdi]
[ 5322.363259] [<ffffffffc0836a1c>] _dahdi_receive+0x35c/0x3a0 [dahdi]
[ 5322.363263] [<ffffffffc07b28b7>] g4_interrupt+0x3b7/0xc7b [opvxg4xx]
[ 5322.363266] [<ffffffffc08332da>] ? _dahdi_transmit+0x23a/0x5b0 [dahdi]
[ 5322.363270] [<ffffffffc08368f5>] ? _dahdi_receive+0x235/0x3a0 [dahdi]
[ 5322.363274] [<ffffffffae141284>] __handle_irq_event_percpu+0x44/0x1c0
[ 5322.363276] [<ffffffffae141432>] handle_irq_event_percpu+0x32/0x80
[ 5322.363277] [<ffffffffae1414bc>] handle_irq_event+0x3c/0x60
[ 5322.363281] [<ffffffffae144ab9>] handle_fasteoi_irq+0x59/0x110
[ 5322.363285] [<ffffffffae02d504>] handle_irq+0xe4/0x1a0
[ 5322.363290] [<ffffffffae1029fc>] ? tick_check_idle+0x8c/0xd0
内核驱动结构非常简单。它使用回调列表注册 dahdi 设备:
/*
 * DAHDI span operations for this driver: the table of callbacks that the
 * DAHDI core invokes on the span and its channels.
 */
static const struct dahdi_span_ops mydev_span_ops = {
	.owner      = THIS_MODULE,

	.spanconfig = mydev_spanconfig,
	.chanconfig = mydev_chanconfig,
	.startup    = mydev_startup,
	.shutdown   = mydev_shutdown,

	.open       = mydev_open,
	.close      = mydev_close,
	.ioctl      = mydev_ioctl,

	/* Forwards hook-state changes to userspace (see mydev_hooksig()). */
	.hooksig    = mydev_hooksig,
	.sync_tick  = mydev_tick,
};
hooksig 回调由 dahdi 模块在摘机/挂机(off-hook/on-hook)时调用,它只是向用户空间守护进程发送一条 netlink 消息:
/*
 * mydev_hooksig - DAHDI .hooksig callback: forward a hook-state change to
 * the userspace daemon.
 * @chan:  channel whose TX signalling state changes; NULL is ignored
 * @txsig: requested signalling state
 *
 * START and OFFHOOK are reported as an "off" command, ONHOOK as an "on"
 * command; any other value is only logged.  The result of nl_send_cmd() is
 * not propagated.
 *
 * NOTE(review): according to the surrounding discussion DAHDI calls this
 * with a spinlock held and interrupts disabled, so nothing reached from
 * here may sleep.
 *
 * Always returns 0.
 */
static int mydev_hooksig(struct dahdi_chan *chan, enum dahdi_txsig txsig)
{
	struct mydevp *card;
	int reg = 0;	/* never updated; only echoed in the debug line below */

	if (!chan)
		return 0;

	card = chan->pvt;

	if (txsig == DAHDI_TXSIG_START || txsig == DAHDI_TXSIG_OFFHOOK)
		nl_send_cmd(chan->chanpos, card->serial, "off", OP_OFFHOOK);
	else if (txsig == DAHDI_TXSIG_ONHOOK)
		nl_send_cmd(chan->chanpos, card->serial, "on", OP_ONHOOK);
	else
		printk(KERN_NOTICE "dahdi_mydev: Can't set tx state to unknown %d\n", txsig);

	printk(KERN_DEBUG "dahdi_mydev: Setting hook state to %d (%02x)\n", txsig, reg);
	return 0;
}
dahdi_rbs_sethook()->mydev_hooksig() 函数在 dahdi 模块中的多个位置调用,并且总是用 spin_lock_irqsave()/spin_unlock_irqrestore() 包裹,但是我不明白为什么只连接一张卡问题就消失了。
所以我就是不知道如何从这个回调中正确发送 netlink 消息。 netlink_broadcast 是否有任何非睡眠变体?
add1:尝试过先 spin_unlock、发送消息、再 spin_lock,但没有帮助。
add2:这是我的 nl_send_cmd() 函数的代码
int nl_send_cmd(int chan, char *serial, char *dial, int op) {
int rc = 0;
struct sk_buff *skb;
void *msg_head;
pr_debug("dahdi_mydev: trying to sent dial string to userspace\n");
skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
if (!skb) {
pr_err("dahdi_mydev: genlmsg_new() failed.\n");
return -ENOMEM;
}
msg_head = genlmsg_put(skb, 0, 0, &span_gnl_family, 0, SPAN_DIAL_CMD);
if (!msg_head) {
pr_err("dahdi_mydev: genlmsg_put() failed.\n");
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_string(skb, ATTR1_STRING, serial);
if (rc) {
pr_err("dahdi_mydev: nla_put_string() failed for serial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_string(skb, ATTR4_STRING, dial);
if (rc) {
pr_err("dahdi_mydev: nla_put_string() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_u32(skb, ATTR2_SINT32, chan);
if (rc) {
pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
rc = nla_put_u32(skb, ATTR5_SINT32, op);
if (rc) {
pr_err("dahdi_mydev: nla_put_sint32() failed for dial: %d\n", rc);
kfree_skb(skb);
return -ENOMEM;
}
genlmsg_end(skb, msg_head);
rc = genlmsg_multicast(&span_gnl_family, skb, 0, 0, GFP_KERNEL);
if (rc) {
pr_info("dahdi_mydev: Dial message didn't sent - no listeners ?\n");
return -ENOTCONN;
}
pr_debug("dahdi_mydev: NL msg for %s sent with '%s' op %d\n", serial, dial, op);
return 0;
}
好的。我花了一些时间找到了答案。解决方案是在 genlmsg_new() 和 genlmsg_multicast() 调用中用 GFP_ATOMIC 替换 GFP_KERNEL,因为最终被调用的 netlink_broadcast() 函数内部有如下代码:
if (info.delivered) {
if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
我认为在单张 USB 卡的情况下,拥塞标志为 false,因此根本不会睡眠。连接另一张卡后,netlink 消息大量增加 -> 拥塞标志置位,加上分配标志为 GFP_KERNEL -> 在持锁的临界区内睡眠。