How can I receive Ethernet frames with ibverbs?
I want to write a simple test program that receives Ethernet frames using the ibverbs API. The code below compiles and runs, but it never receives any packets. I am using Mellanox ConnectX-3 hardware on Ubuntu 18.

Questions:

1. If I ping the InfiniBand interface from another machine while this RX program is running, the ping gets a response. I did not expect that: the RX program should grab the ping requests, so the Linux IP stack should never see them and should not respond. What should happen?
2. Is there anything obviously wrong with my code?
3. Do I need a steering rule at all? If I remove the call to ibv_create_flow(), should I then receive all packets that the interface sees?
#include <infiniband/verbs.h>
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PORT_NUM 1
#define MAX_MSG_SIZE 1500 // The maximum size of each received packet.
#define RQ_NUM_DESC 512   // Max packets that can be received without processing.

// The MAC of the interface we are listening on.
#define DEST_MAC { 0x00, 0x0d, 0x3a, 0x47, 0x1c, 0x2e }

#define FATAL_ERROR(msg, ...) { fprintf(stderr, "ERROR: " msg "\n", ##__VA_ARGS__); exit(-1); }

int main() {
    // Get the list of devices.
    int num_devices = 0;
    struct ibv_device **dev_list = ibv_get_device_list(&num_devices);
    if (!dev_list)
        FATAL_ERROR("Failed to get IB devices list.");

    // Choose the first device.
    struct ibv_device *ib_dev = dev_list[0];
    if (!ib_dev)
        FATAL_ERROR("IB device not found.");
    printf("Found %i Infiniband device(s).\n", num_devices);
    printf("Using device '%s'.\n", ibv_get_device_name(ib_dev));

    // Get the device context.
    struct ibv_context *context = ibv_open_device(ib_dev);
    if (!context)
        FATAL_ERROR("Couldn't get context for device.");

    // Allocate a protection domain (PD) that will group memory
    // regions (MR) and rings.
    struct ibv_pd *pd = ibv_alloc_pd(context);
    if (!pd)
        FATAL_ERROR("Couldn't allocate protection domain.");

    // Create a completion queue (CQ).
    struct ibv_cq *cq = ibv_create_cq(context, RQ_NUM_DESC, NULL, NULL, 0);
    if (!cq)
        FATAL_ERROR("Couldn't create completion queue. errno = %d.", errno);

    // Create a queue pair (QP).
    struct ibv_qp_init_attr qp_init_attr = {
        .qp_context = NULL,
        .send_cq = cq, // Report receive completion to CQ.
        .recv_cq = cq,
        .cap = {
            .max_send_wr = 0,           // No send ring.
            .max_recv_wr = RQ_NUM_DESC, // Max num packets in ring.
            .max_recv_sge = 1,          // Only one pointer per descriptor.
        },
        .qp_type = IBV_QPT_RAW_PACKET, // Use Ethernet packets.
    };
    struct ibv_qp *qp = ibv_create_qp(pd, &qp_init_attr);
    if (!qp)
        FATAL_ERROR("Couldn't create queue pair.");

    // Initialize the QP (receive ring) and assign a port.
    struct ibv_qp_attr qp_attr = { 0 };
    qp_attr.qp_state = IBV_QPS_INIT;
    qp_attr.port_num = PORT_NUM;
    int qp_flags = IBV_QP_STATE | IBV_QP_PORT;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) < 0)
        FATAL_ERROR("Failed to initialize queue pair.");

    // Move ring state to ready-to-receive. This is needed in
    // order to be able to receive packets.
    memset(&qp_attr, 0, sizeof(qp_attr));
    qp_flags = IBV_QP_STATE;
    qp_attr.qp_state = IBV_QPS_RTR;
    if (ibv_modify_qp(qp, &qp_attr, qp_flags) < 0)
        FATAL_ERROR("Failed to put queue pair into ready-to-receive state.");

    // Allocate memory for the packet buffer.
    int buf_size = MAX_MSG_SIZE * RQ_NUM_DESC; // Maximum size of data to be accessed by hardware.
    void *buf = malloc(buf_size);
    if (!buf)
        FATAL_ERROR("Couldn't allocate memory.");

    // Register the user memory so it can be accessed by the HW directly.
    struct ibv_mr *mr = ibv_reg_mr(pd, buf, buf_size, IBV_ACCESS_LOCAL_WRITE);
    if (!mr)
        FATAL_ERROR("Couldn't register memory region.");

    // Create a scatter/gather entry.
    struct ibv_sge sg_entry;
    sg_entry.length = MAX_MSG_SIZE;
    sg_entry.lkey = mr->lkey;

    // Create a receive work request.
    struct ibv_recv_wr wr;
    wr.num_sge = 1;
    wr.sg_list = &sg_entry;
    wr.next = NULL;

    // Post a load of receive work requests onto the receive queue.
    struct ibv_recv_wr *bad_wr;
    for (int n = 0; n < RQ_NUM_DESC; n++) {
        // Each descriptor points to a max-MTU-size buffer.
        sg_entry.addr = (uint64_t)buf + MAX_MSG_SIZE * n;
        // When a packet is received, a work completion will be created
        // corresponding to this work request. It will contain this field.
        wr.wr_id = n;
        // Post the receive buffer to the ring.
        int rv = ibv_post_recv(qp, &wr, &bad_wr);
        if (rv != 0) {
            FATAL_ERROR("Posting recv failed with error code %i.", rv);
        }
    }

    // Create the steering rule.
    struct raw_eth_flow_attr {
        struct ibv_flow_attr attr;
        struct ibv_flow_spec_eth spec_eth;
    } __attribute__((packed)) flow_attr = {
        .attr = {
            .comp_mask = 0,
            .type = IBV_FLOW_ATTR_NORMAL,
            .size = sizeof(flow_attr),
            .priority = 0,
            .num_of_specs = 1,
            .port = PORT_NUM,
            .flags = 0,
        },
        .spec_eth = {
            .type = IBV_FLOW_SPEC_ETH,
            .size = sizeof(struct ibv_flow_spec_eth),
            .val = {
                .dst_mac = DEST_MAC,
                .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
                .ether_type = 0,
                .vlan_tag = 0,
            },
            .mask = {
                .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF },
                .ether_type = 0,
                .vlan_tag = 0,
            }
        }
    };

    // Register the steering rule to intercept packets to DEST_MAC and place
    // them in the ring pointed to by qp.
    struct ibv_flow *eth_flow = ibv_create_flow(qp, &flow_attr.attr);
    if (!eth_flow)
        FATAL_ERROR("Couldn't attach steering flow. Does DEST_MAC match that of the local NIC?");

    printf("Receiving.\n");
    while (1) {
        // Wait for a CQ event upon message received, and print a message.
        struct ibv_wc wc;
        int msgs_completed = ibv_poll_cq(cq, 1, &wc);
        if (msgs_completed > 0) {
            printf("Message %ld received size %d\n", wc.wr_id, wc.byte_len);
            sg_entry.addr = (uint64_t)buf + wc.wr_id * MAX_MSG_SIZE;
            wr.wr_id = wc.wr_id;
            // After processing, the buffer must be posted back.
            int rv = ibv_post_recv(qp, &wr, &bad_wr);
            if (rv != 0) {
                FATAL_ERROR("Re-posting recv failed with error code %i.", rv);
            }
        }
        else if (msgs_completed < 0) {
            FATAL_ERROR("Polling error.");
        }
    }
}
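(For reference: the program links against libibverbs, e.g. gcc -o rx rx.c -libverbs, and creating an IBV_QPT_RAW_PACKET QP requires CAP_NET_RAW, so it is run as root.)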
See https://github.com/Mellanox/libvma/wiki/Architecture

VMA implements the native RDMA verbs API. The native RDMA verbs have been extended to RDMA-capable Ethernet NICs, so packets can pass directly between the user application and the InfiniBand HCA or Ethernet NIC, bypassing the kernel and its TCP/UDP network stack.
Take a look at this example from Mellanox: https://community.mellanox.com/s/article/raw-ethernet-programming--basic-introduction---code-example
To receive everything the interface sees, you can use the experimental API (#include <infiniband/verbs_exp.h>); then, when creating the steering rule, use ibv_exp_flow_attr with the type set to IBV_EXP_FLOW_ATTR_SNIFFER.
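A minimal sketch of that approach, reusing qp, PORT_NUM, and FATAL_ERROR from the question's code. It assumes an MLNX_OFED installation that ships the experimental verbs (verbs_exp.h is not part of upstream rdma-core) and that ibv_exp_flow_attr mirrors the standard ibv_flow_attr layout:

#include <infiniband/verbs_exp.h>

// A sniffer rule with no match specs steers all port traffic into the
// QP's receive ring (assumption: SNIFFER rules take no flow specs).
struct ibv_exp_flow_attr sniffer_attr = {
    .type = IBV_EXP_FLOW_ATTR_SNIFFER,
    .size = sizeof(sniffer_attr),
    .priority = 0,
    .num_of_specs = 0, // No ibv_exp_flow_spec_* entries follow this struct.
    .port = PORT_NUM,
    .flags = 0,
};

struct ibv_exp_flow *sniffer_flow = ibv_exp_create_flow(qp, &sniffer_attr);
if (!sniffer_flow)
    FATAL_ERROR("Couldn't attach sniffer flow. Are the experimental verbs available?");

Upstream libibverbs also defines an IBV_FLOW_ATTR_SNIFFER type for the standard ibv_flow_attr, but whether a sniffer rule can actually be attached depends on the device, driver, and privileges.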