DPDK KNI原理和实现
——lvyilong316
kni的整体实现原理是采用共享内存方式:如下图,DPDK创建rx_q、tx_q等队列,并将其地址传递给kni内核模块,从而实现rx_q和tx_q的内存共享,之后DPDK和内核就可以将数据放入对应的共享内存队列中完成报文的收发。下面分别介绍kni的用户态部分和内核部分。
用户态部分
首先我们看用户态部分。用户态部分我们直接看DPDK的kni example。其中关键初始化函数是main函数中调用的init_kni()。其中主要调用rte_kni_init。
-
void
-
rte_kni_init(unsigned int max_kni_ifaces)
-
{
-
uint32_t i;
-
struct rte_kni_memzone_slot *it;
-
const struct rte_memzone *mz;
-
#define OBJNAMSIZ 32
-
char obj_name[OBJNAMSIZ];
-
char mz_name[RTE_MEMZONE_NAMESIZE];
-
-
/* Immediately return if KNI is already initialized */
-
if (kni_memzone_pool.initialized) {
-
RTE_LOG(WARNING, KNI, "Double call to rte_kni_init()");
-
return;
-
}
-
-
if (max_kni_ifaces == 0) {
-
RTE_LOG(ERR, KNI, "Invalid number of max_kni_ifaces %d\n",
-
max_kni_ifaces);
-
RTE_LOG(ERR, KNI, "Unable to initialize KNI\n");
-
return;
-
}
-
/* 对/dev/kni调用open */
-
/* Check FD and open */
-
if (kni_fd < 0) {
-
kni_fd = open("/dev/" KNI_DEVICE, O_RDWR);
-
if (kni_fd < 0) {
-
RTE_LOG(ERR, KNI,
-
"Can not open /dev/%s\n", KNI_DEVICE);
-
return;
-
}
-
}
-
-
/* Allocate slot objects */
-
kni_memzone_pool.slots = (struct rte_kni_memzone_slot *)
-
rte_malloc(NULL,
-
sizeof(struct rte_kni_memzone_slot) *
-
max_kni_ifaces,
-
0);
-
KNI_MEM_CHECK(kni_memzone_pool.slots == NULL);
-
-
/* Initialize general pool variables */
-
kni_memzone_pool.initialized = 1;
-
kni_memzone_pool.max_ifaces = max_kni_ifaces;
-
kni_memzone_pool.free = &kni_memzone_pool.slots[0];
-
rte_spinlock_init(&kni_memzone_pool.mutex);
-
/* 根据kni设备数量初始化对应的kni slot,每个slot存放着kni需要使用的各种queue */
-
/* Pre-allocate all memzones of all the slots; panic on error */
-
for (i = 0; i < max_kni_ifaces; i++) {
-
-
/* Recover current slot */
-
it = &kni_memzone_pool.slots[i];
-
it->id = i;
-
-
/* Allocate KNI context */
-
snprintf(mz_name, RTE_MEMZONE_NAMESIZE, "KNI_INFO_%d", i);
-
mz = kni_memzone_reserve(mz_name, sizeof(struct rte_kni),
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_ctx = mz;
-
-
/* TX RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_tx_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_tx_q = mz;
-
-
/* RX RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_rx_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_rx_q = mz;
-
-
/* ALLOC RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_alloc_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_alloc_q = mz;
-
-
/* FREE RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_free_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_free_q = mz;
-
-
/* Request RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_req_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_req_q = mz;
-
-
/* Response RING */
-
snprintf(obj_name, OBJNAMSIZ, "kni_resp_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_resp_q = mz;
-
-
/* Req/Resp sync mem area */
-
snprintf(obj_name, OBJNAMSIZ, "kni_sync_%d", i);
-
mz = kni_memzone_reserve(obj_name, KNI_FIFO_SIZE,
-
SOCKET_ID_ANY, 0);
-
KNI_MEM_CHECK(mz == NULL);
-
it->m_sync_addr = mz;
-
-
if ((i+1) == max_kni_ifaces) {
-
it->next = NULL;
-
kni_memzone_pool.free_tail = it;
-
} else
-
it->next = &kni_memzone_pool.slots[i+1];
-
}
-
-
return;
-
-
kni_fail:
-
RTE_LOG(ERR, KNI, "Unable to allocate memory for max_kni_ifaces:%d."
-
"Increase the amount of hugepages memory\n", max_kni_ifaces);
-
}
其核心就是根据kni设备数量初始化对应的 rte_kni_memzone_slot,其中 rte_kni_memzone_slot中主要保存着这个kni设备需要在数据传输中使用的各种queue,如tx_q,rx_q,free_q等,每个queue对应一个DPDK memzone。初始化完如下图所示。
另外一个关键初始化函数是main函数中对每个kni port调用的kni_alloc,而kni_alloc中主要调用rte_kni_alloc。关于rte_kni_alloc的部分这里就不再展开,主要是分配一个可用的rte_kni_memzone_slot,并将其相关信息赋值给一个struct rte_kni_device_info结构,然后用struct rte_kni_device_info作为参数调用kni设备的ioctl create:
ret = ioctl(kni_fd, RTE_KNI_IOCTL_CREATE, &dev_info);
这样就将DPDK用户态那些queue对应的内存和内核kni设备关联起来了。然后我们看用户态数据是如何发往内核的。DPDK侧的主要发送函数为rte_kni_tx_burst
-
unsigned
-
rte_kni_tx_burst(struct rte_kni *kni, struct rte_mbuf **mbufs, unsigned num)
-
{
-
void *phy_mbufs[num];
-
unsigned int ret;
-
unsigned int i;
-
-
for (i = 0; i < num; i++)
-
phy_mbufs[i] = va2pa(mbufs[i]);
-
-
ret = kni_fifo_put(kni->rx_q, phy_mbufs, num);
-
-
/* Get mbufs from free_q and then free them */
-
kni_free_mbufs(kni);
-
-
return ret;
-
}
rte_kni_tx_burst主要调用 kni_fifo_put将mbuf的物理地址写入fifo中。
-
static inline uint32_t
-
kni_fifo_put(struct rte_kni_fifo *fifo, void **data, uint32_t num)
-
{
-
uint32_t i = 0;
-
uint32_t fifo_write = fifo->write;
-
uint32_t fifo_read = fifo->read;
-
uint32_t new_write = fifo_write;
-
-
for (i = 0; i < num; i++) {
-
new_write = (new_write + 1) & (fifo->len - 1);
-
-
if (new_write == fifo_read)
-
break;
-
fifo->buffer[fifo_write] = data[i];
-
fifo_write = new_write;
-
}
-
fifo->write = fifo_write;
-
-
return i;
-
}
至于内核侧的接收逻辑,我们在后面的内核部分再讲。用户态的接收逻辑也类似:从对应的tx_q fifo(内核发送时写入的队列)中取出mbuf,这里就不再展开了。
内核部分
KNI是DPDK中带有的内核模块,在新版本的DPDK中KNI已经从DPDK代码库中移除独立维护了。我们这里就以老版本的DPDK来分析,以DPDK17.11.2来说,KNI模块对应的代码路径为:dpdk-stable-17.11.2\lib\librte_eal\linuxapp\kni。
其初始化函数为kni_init。
-
static int __init
-
kni_init(void)
-
{
-
int rc;
-
-
if (kni_parse_kthread_mode() < 0) { //kni的线程模式、单线程还是多线程
-
pr_err("Invalid parameter for kthread_mode\n");
-
return -EINVAL;
-
}
-
-
if (multiple_kthread_on == 0)
-
pr_debug("Single kernel thread for all KNI devices\n");
-
else
-
pr_debug("Multiple kernel thread mode enabled\n");
-
-
#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS
-
rc = register_pernet_subsys(&kni_net_ops); //在每个namespace中执行kni_net_ops->init函数
-
#else
-
rc = register_pernet_gen_subsys(&kni_net_id, &kni_net_ops);
-
#endif
-
if (rc)
-
return -EPERM;
-
-
rc = misc_register(&kni_misc); //注册kni misc设备
-
if (rc != 0) {
-
pr_err("Misc registration failed\n");
-
goto out;
-
}
-
-
/* Configure the lo mode according to the input parameter */
-
kni_net_config_lo_mode(lo_mode);
-
-
return 0;
-
-
out:
-
#ifdef HAVE_SIMPLIFIED_PERNET_OPERATIONS
-
unregister_pernet_subsys(&kni_net_ops);
-
#else
-
unregister_pernet_gen_subsys(kni_net_id, &kni_net_ops);
-
#endif
-
return rc;
-
}
代码比较简单,首先选择kni的线程模式,分为单线程和多线程:所谓单线程是指所有kni端口的收发都由同一个线程负责,多线程则是每一个kni端口分别由一个独立的线程负责。线程模式是在插入模块时通过模块参数选择的。
接着调用register_pernet_subsys在每个namespace中调用kni_net_ops->init函数,也就是kni_init_net。kni_init_net会在对应的namespace分配struct kni_net结构。
最后调用注册函数misc_register,将kni注册为一个杂项(misc)设备,注册之后就会出现/dev/kni路径,用户态程序就可以对其进行open。其中kni_misc结构体里面定义了该杂项设备的一些操作:
-
/*
 * Misc-device descriptor for the KNI control node. It is the object passed
 * to misc_register() in kni_init(); user space opens it as
 * "/dev/" KNI_DEVICE.
 */
static struct miscdevice kni_misc = {
-
/* Let the misc core pick the minor number */
.minor = MISC_DYNAMIC_MINOR,
-
/* Device node name */
.name = KNI_DEVICE,
-
/* File operations (open/release/ioctl) for the node */
.fops = &kni_fops,
-
};
这里主要看.fops里面的结构体
-
/*
 * File operations of the KNI misc device. Only open, release and the two
 * ioctl entry points are provided; there are no read/write handlers, so
 * the node is a control channel only.
 */
static const struct file_operations kni_fops = {
-
.owner = THIS_MODULE,
-
.open = kni_open,
-
.release = kni_release,
-
/* The handlers are cast to void * to fit the callback pointer types */
.unlocked_ioctl = (void *)kni_ioctl,
-
.compat_ioctl = (void *)kni_compat_ioctl,
-
};
其中任何用户态程序(如DPDK)要向内核kni发送数据之前,都需要先调用这里的open函数,即kni_open。
-
static int
-
kni_open(struct inode *inode, struct file *file)
-
{
-
struct net *net = current->nsproxy->net_ns;
-
struct kni_net *knet = net_generic(net, kni_net_id);
-
-
/* kni device can be opened by one user only per netns */
-
if (test_and_set_bit(KNI_DEV_IN_USE_BIT_NUM, &knet->device_in_use))
-
return -EBUSY;
-
-
file->private_data = get_net(net);
-
pr_debug("/dev/kni opened\n");
-
-
return 0;
-
}
kni_open主要是完成内核file结构private_data的设置。内核另外一个关键初始化函数为kni_ioctl_create,这是由用户态程序调用ioctl create执行的。此外我们注意到kni的kni_fops没有read/write函数,所以其不能像tap设备一样使用读写文件的方式进行数据收发。下面重点看kni_ioctl_create,简化后其主要流程如下:
-
static int
-
kni_ioctl_create(struct net *net, uint32_t ioctl_num,
-
unsigned long ioctl_param)
-
{
-
struct kni_net *knet = net_generic(net, kni_net_id);
-
int ret;
-
struct rte_kni_device_info dev_info;
-
struct net_device *net_dev = NULL;
-
struct kni_dev *kni, *dev, *n;
-
-
/* Copy kni info from user space */
-
ret = copy_from_user(&dev_info, (void *)ioctl_param, sizeof(dev_info));
-
if (ret) {
-
pr_err("copy_from_user in kni_ioctl_create");
-
return -EIO;
-
}
-
-
list_for_each_entry_safe(dev, n, &knet->kni_list_head, list) {
-
if (kni_check_param(dev, &dev_info) < 0) {
-
up_read(&knet->kni_list_lock);
-
return -EINVAL;
-
}
-
}
-
up_read(&knet->kni_list_lock);
-
-
/* 分配struct net_device结构,并将struct kni_dev作为net_device的private,并使用入参初始化 */
-
net_dev = alloc_netdev(sizeof(struct kni_dev), dev_info.name,
-
kni_net_init);
-
if (net_dev == NULL) {
-
pr_err("error allocating device \"%s\"\n", dev_info.name);
-
return -EBUSY;
-
}
-
-
dev_net_set(net_dev, net);
-
-
kni = netdev_priv(net_dev);
-
-
kni->net_dev = net_dev;
-
kni->group_id = dev_info.group_id;
-
kni->core_id = dev_info.core_id;
-
strncpy(kni->name, dev_info.name, RTE_KNI_NAMESIZE);
-
-
/* Translate user space info into kernel space info */
-
kni->tx_q = phys_to_virt(dev_info.tx_phys);
-
kni->rx_q = phys_to_virt(dev_info.rx_phys);
-
kni->alloc_q = phys_to_virt(dev_info.alloc_phys);
-
kni->free_q = phys_to_virt(dev_info.free_phys);
-
-
kni->req_q = phys_to_virt(dev_info.req_phys);
-
kni->resp_q = phys_to_virt(dev_info.resp_phys);
-
kni->sync_va = dev_info.sync_va;
-
kni->sync_kva = phys_to_virt(dev_info.sync_phys);
-
-
kni->mbuf_size = dev_info.mbuf_size;
-
-
if (kni->lad_dev)
-
ether_addr_copy(net_dev->dev_addr, kni->lad_dev->dev_addr);
-
else
-
/*
-
* Generate random mac address. eth_random_addr() is the newer
-
* version of generating mac address in linux kernel.
-
*/
-
random_ether_addr(net_dev->dev_addr);
-
/* 将net_device结构注册在系统 */
-
ret = register_netdev(net_dev);
-
-
/* 创建数据处理线程 */
-
ret = kni_run_thread(knet, kni, dev_info.force_bind);
-
if (ret != 0)
-
return ret;
-
-
down_write(&knet->kni_list_lock);
-
list_add(&kni->list, &knet->kni_list_head);
-
up_write(&knet->kni_list_lock);
-
-
return 0;
-
}
分配struct net_device结构使用kni_net_init进行初始化,并将struct kni_dev作为net_device的private,并使用入参初始化net_device,入参我们从DPDK用户态部分知道为struct rte_kni_device_info,而其又是存放kni设备对应的rte_kni_memzone_slot信息。所以这里就将用户态DPDK的rte_kni_memzone_slot中存放的各种queue的内存地址传递给内核kni_dev设备,实现了queue区域的内存共享。如下图所示。
随后调用register_netdev将kni对应的struct net_device结构注册给系统,因此我们也能使用ifconfig看到kni设备对应的网卡设备,并可以使用tcpdump对其进行抓包。
最后kni_run_thread创建内核kni设备的收发线程。我们只看单线程情况,其线程执行函数为kni_thread_single:
-
static int
-
kni_thread_single(void *data)
-
{
-
struct kni_net *knet = data;
-
int j;
-
struct kni_dev *dev;
-
-
while (!kthread_should_stop()) {
-
down_read(&knet->kni_list_lock);
-
for (j = 0; j < KNI_RX_LOOP_NUM; j++) {
-
list_for_each_entry(dev, &knet->kni_list_head, list) {
-
kni_net_rx(dev);
-
kni_net_poll_resp(dev);
-
}
-
}
-
up_read(&knet->kni_list_lock);
-
}
-
-
return 0;
-
}
其主要循环遍历所有kni设备,调用kni_net_rx。而kni_net_rx又进一步调用kni_net_rx_func,kni_net_rx_func被初始化为kni_net_rx_normal。
-
static void
-
kni_net_rx_normal(struct kni_dev *kni)
-
{
-
uint32_t ret;
-
uint32_t len;
-
uint32_t i, num_rx, num_fq;
-
struct rte_kni_mbuf *kva;
-
void *data_kva;
-
struct sk_buff *skb;
-
struct net_device *dev = kni->net_dev;
-
-
/* Get the number of free entries in free_q */
-
num_fq = kni_fifo_free_count(kni->free_q);
-
if (num_fq == 0) {
-
/* No room on the free_q, bail out */
-
return;
-
}
-
-
/* Calculate the number of entries to dequeue from rx_q */
-
num_rx = min_t(uint32_t, num_fq, MBUF_BURST_SZ);
-
-
/* Burst dequeue from rx_q */
-
num_rx = kni_fifo_get(kni->rx_q, kni->pa, num_rx);
-
if (num_rx == 0)
-
return;
-
-
/* Transfer received packets to netif */
-
for (i = 0; i < num_rx; i++) {
-
kva = pa2kva(kni->pa[i]);
-
len = kva->pkt_len;
-
data_kva = kva2data_kva(kva);
-
kni->va[i] = pa2va(kni->pa[i], kva);
-
-
skb = dev_alloc_skb(len + 2);
-
if (!skb) {
-
/* Update statistics */
-
kni->stats.rx_dropped++;
-
continue;
-
}
-
-
/* Align IP on 16B boundary */
-
skb_reserve(skb, 2);
-
-
if (kva->nb_segs == 1) {
-
memcpy(skb_put(skb, len), data_kva, len);
-
} else {
-
int nb_segs;
-
int kva_nb_segs = kva->nb_segs;
-
-
for (nb_segs = 0; nb_segs < kva_nb_segs; nb_segs++) {
-
memcpy(skb_put(skb, kva->data_len),
-
data_kva, kva->data_len);
-
-
if (!kva->next)
-
break;
-
-
kva = pa2kva(va2pa(kva->next, kva));
-
data_kva = kva2data_kva(kva);
-
}
-
}
-
-
skb->dev = dev;
-
skb->protocol = eth_type_trans(skb, dev);
-
skb->ip_summed = CHECKSUM_UNNECESSARY;
-
-
/* Call netif interface */
-
netif_rx_ni(skb);
-
-
/* Update statistics */
-
kni->stats.rx_bytes += len;
-
kni->stats.rx_packets++;
-
}
-
-
/* Burst enqueue mbufs into free_q */
-
ret = kni_fifo_put(kni->free_q, kni->va, num_rx);
-
if (ret != num_rx)
-
/* Failing should not happen */
-
pr_err("Fail to enqueue entries into free_q\n");
-
}
其主要就是从共享内存队列的fifo中取出报文,复制到新分配的skb中,调用netif_rx_ni(skb)送入协议栈,最后把处理完的mbuf放入free_q,交还给用户态释放。
如果内核态向DPDK侧发送数据呢?前面讲到分配struct net_device结构使用kni_net_init进行初始化,其中将net_device的netdev_ops设置为kni_net_netdev_ops:
dev->netdev_ops = &kni_net_netdev_ops;
其ndo_start_xmit被设置为kni_net_tx,当我们从kni对应的net设备发送数据时最终就会调用到kni_net_tx。
-
/*
 * net_device operations installed on every KNI net device. The callback
 * of interest for the data path is .ndo_start_xmit, which points at
 * kni_net_tx().
 */
static const struct net_device_ops kni_net_netdev_ops = {
-
.ndo_open = kni_net_open,
-
.ndo_stop = kni_net_release,
-
.ndo_set_config = kni_net_config,
-
/* Transmit hook: kernel -> user-space direction */
.ndo_start_xmit = kni_net_tx,
-
.ndo_change_mtu = kni_net_change_mtu,
-
.ndo_do_ioctl = kni_net_ioctl,
-
.ndo_set_rx_mode = kni_net_set_rx_mode,
-
.ndo_get_stats = kni_net_stats,
-
.ndo_tx_timeout = kni_net_tx_timeout,
-
.ndo_set_mac_address = kni_net_set_mac,
-
/* Only set when the build defines HAVE_CHANGE_CARRIER_CB */
#ifdef HAVE_CHANGE_CARRIER_CB
-
.ndo_change_carrier = kni_net_change_carrier,
-
#endif
-
};
-
static int
-
kni_net_tx(struct sk_buff *skb, struct net_device *dev)
-
{
-
int len = 0;
-
uint32_t ret;
-
struct kni_dev *kni = netdev_priv(dev);
-
struct rte_kni_mbuf *pkt_kva = NULL;
-
void *pkt_pa = NULL;
-
void *pkt_va = NULL;
-
-
/**
-
* Check if it has at least one free entry in tx_q and
-
* one entry in alloc_q.
-
*/
-
if (kni_fifo_free_count(kni->tx_q) == 0 ||
-
kni_fifo_count(kni->alloc_q) == 0) {
-
/**
-
* If no free entry in tx_q or no entry in alloc_q,
-
* drops skb and goes out.
-
*/
-
goto drop;
-
}
-
-
/* dequeue a mbuf from alloc_q */
-
ret = kni_fifo_get(kni->alloc_q, &pkt_pa, 1);
-
if (likely(ret == 1)) {
-
void *data_kva;
-
-
pkt_kva = pa2kva(pkt_pa);
-
data_kva = kva2data_kva(pkt_kva);
-
pkt_va = pa2va(pkt_pa, pkt_kva);
-
-
len = skb->len;
-
memcpy(data_kva, skb->data, len);
-
if (unlikely(len < ETH_ZLEN)) {
-
memset(data_kva + len, 0, ETH_ZLEN - len);
-
len = ETH_ZLEN;
-
}
-
pkt_kva->pkt_len = len;
-
pkt_kva->data_len = len;
-
-
/* enqueue mbuf into tx_q */
-
ret = kni_fifo_put(kni->tx_q, &pkt_va, 1);
-
if (unlikely(ret != 1)) {
-
/* Failing should not happen */
-
pr_err("Fail to enqueue mbuf into tx_q\n");
-
goto drop;
-
}
-
} else {
-
/* Failing should not happen */
-
pr_err("Fail to dequeue mbuf from alloc_q\n");
-
goto drop;
-
}
-
-
/* Free skb and update statistics */
-
dev_kfree_skb(skb);
-
kni->stats.tx_bytes += len;
-
kni->stats.tx_packets++;
-
-
return NETDEV_TX_OK;
-
-
drop:
-
/* Free skb and update statistics */
-
dev_kfree_skb(skb);
-
kni->stats.tx_dropped++;
-
-
return NETDEV_TX_OK;
-
}
其主要逻辑就是将skb报文复制到mbuf中,并把该mbuf放入tx_q的fifo。然后另一端DPDK从fifo中取出报文,完成内核到用户态的交互。以上我们就分析了kni设备的DPDK和内核交互原理。
阅读(4544) | 评论(0) | 转发(0) |