dpdk中断机制
——lvyilong316
这里主要介绍一下dpdk的中断机制,虽然dpdk大多数场景用的是polling模式,但是也是支持中断模式的,另一方面除了收发包之外,设备的其他功能,如状态改变等,还是要依赖中断机制。当然dpdk的中断是用户态的中断,实现方式是通过vfio或uio模块将内核的中断传递到用户态,具体vfio和uio的工作方式不是本文的重点,这里重点关注dpdk的中断处理流程。首先看一下dpdk中断处理相关的初始化流程。
3.5.1 中断初始化
rte_eal_init → rte_eal_intr_init
中断初始化主要在rte_eal_intr_init中完成。
l rte_eal_intr_init
在rte_eal_intr_init()函数中初始化中断。具体如下:
(1) 首先初始化intr_sources链表。所有设备的中断都挂在这个链表上,中断处理线程通过遍历这个链表,来执行设备的中断。
(2) 创建intr_pipe管道,用于epoll模型的消息通知。
(3) 创建线程intr_thread,线程的执行体是eal_intr_thread_main()函数,创建epoll模型,遍历intr_sources链表,监听已注册的所有UIO设备的中断事件,并调用对应UIO设备的中断处理函数。
-
int rte_eal_intr_init(void)
-
{
-
int ret = 0, ret_1 = 0;
-
char thread_name[RTE_MAX_THREAD_NAME_LEN];
-
-
/* init the global interrupt source head */
-
/*初始化intr_sources全局链表,用来存放设备的中断资源*/
-
TAILQ_INIT(&intr_sources);
-
-
/**
-
* create a pipe which will be waited by epoll and notified to
-
* rebuild the wait list of epoll.
-
*/
-
/*创建管道,返回的两个fd存放在全局变量intr_pipe中*/
-
if (pipe(intr_pipe.pipefd) < 0)
-
return -1;
-
/*创建中断处理线程*/
-
/* create the host thread to wait/handle the interrupt */
-
ret = pthread_create(&intr_thread, NULL,
-
eal_intr_thread_main, NULL);
-
if (ret != 0) {
-
RTE_LOG(ERR, EAL,
-
"Failed to create thread for interrupt handling\n");
-
} else {
-
/* Set thread_name for aid in debugging. */
-
snprintf(thread_name, RTE_MAX_THREAD_NAME_LEN,
-
"eal-intr-thread");
-
ret_1 = rte_thread_setname(intr_thread, thread_name);
-
if (ret_1 != 0)
-
RTE_LOG(DEBUG, EAL,
-
"Failed to set thread name for interrupt handling\n");
-
}
-
-
return -ret;
-
}
在继续分析之前先看下intr_sources这个全局链表的样子,如下图所示:
链表由struct
rte_intr_source结构组成,每个struct
rte_intr_source结构描述一个设备的中断信息。而struct
rte_intr_source中又有三个重要成员:
l intr_handle
-
struct rte_intr_handle {
-
RTE_STD_C11
-
union {
-
int vfio_dev_fd; /**< VFIO device file descriptor */
-
int uio_cfg_fd; /**< UIO config file descriptor
-
for uio_pci_generic */
-
};
-
int fd; /**< interrupt event file descriptor */
-
enum rte_intr_handle_type type; /**< handle type */
-
uint32_t max_intr; /* nb_efd+1 */
-
uint32_t nb_efd; /* efds中有效的个数 */
-
int efds[RTE_MAX_RXTX_INTR_VEC_ID]; /*传递中断的fd,每个队列一个 */
-
struct rte_epoll_event elist[RTE_MAX_RXTX_INTR_VEC_ID];
-
/**< intr vector epoll event */
-
int *intr_vec; /**< intr vector number array,每个队列ring 的offset*/
-
};
这个结构用来记录设备中断的相关信息,其中主要是设备每个队列对应的传递中断的fd(如uio或vfio暴露给用户态的文件fd)。当然较新的dpdk(如18.05)虚拟设备也可以支持中断,如vhost_user后端设备。如果对vhost_user设备的rte_intr_handle进行初始化,可以如下进行:
-
static int
-
eth_vhost_install_intr(struct rte_eth_dev *dev)
-
{
-
struct rte_vhost_vring vring;
-
struct vhost_queue *vq;
-
int count = 0;
-
int nb_rxq = dev->data->nb_rx_queues;
-
int i;
-
int ret;
-
-
dev->intr_handle = malloc(sizeof(*dev->intr_handle));
-
memset(dev->intr_handle, 0, sizeof(*dev->intr_handle));
-
dev->intr_handle->intr_vec =
-
malloc(nb_rxq * sizeof(dev->intr_handle->intr_vec[0]));
-
-
for (i = 0; i < nb_rxq; i++) {
-
vq = dev->data->rx_queues[i];
-
if (!vq)
-
continue;
-
ret = rte_vhost_get_vhost_vring(vq->vid, i << 1, &vring);
-
dev->intr_handle->intr_vec[i] = RTE_INTR_VEC_RXTX_OFFSET + i;
-
dev->intr_handle->efds[i] = vring.callfd; /*对于vhost_user设备这里就使用callfd接收来自前端的中断*/
-
count++;
-
}
-
-
dev->intr_handle->nb_efd = count;
-
dev->intr_handle->max_intr = count + 1;
-
dev->intr_handle->type = RTE_INTR_HANDLE_VDEV;
-
-
return 0;
-
}
l callbacks
这是一个rte_intr_callback结构组成的链表,主要保存设备的中断处理函数和参数信息。为什么要一个链表呢?因为可以对一个中断注册多个处理函数。
l active
描述设备中断的状态。设备上是否有未处理的中断。
下面来看eal_intr_thread_main函数,也就是中断线程的主体函数。
l eal_intr_thread_main
中断线程执行主体eal_intr_thread_main()函数具体如下:
(1) epoll_create()创建epoll模型。
(2) 将intr_pipe管道加入到epoll中。
(3) 遍历intr_sources链表,将所有UIO设备加入到epoll中。
(4) 执行eal_intr_handle_interrupts()函数。
l eal_intr_thread_main
-
/**
 * Body of the interrupt thread; it never returns.
 *
 * Each pass builds a fresh epoll instance containing the read end of
 * intr_pipe and the fd of every interrupt source that has at least one
 * callback, serves interrupts through eal_intr_handle_interrupts(), and
 * rebuilds the set once that function returns.
 */
static __attribute__((noreturn)) void * eal_intr_thread_main(__rte_unused void *arg)
{
	/* host thread, never break out */
	while (1) {
		struct epoll_event pipe_ev = {
			.events = EPOLLIN | EPOLLPRI,
		};
		struct epoll_event src_ev;
		struct rte_intr_source *source;
		unsigned nb_fds;
		int epfd;

		/* create epoll fd */
		epfd = epoll_create(1);
		if (epfd < 0)
			rte_panic("Cannot create epoll instance\n");

		/* intr_pipe is a global set up by rte_eal_intr_init();
		 * the pipe is what triggers a rebuild of the wait list.
		 */
		pipe_ev.data.fd = intr_pipe.readfd;
		if (epoll_ctl(epfd, EPOLL_CTL_ADD, intr_pipe.readfd,
				&pipe_ev) < 0) {
			rte_panic("Error adding fd to %d epoll_ctl, %s\n",
				intr_pipe.readfd, strerror(errno));
		}
		nb_fds = 1;

		/* add the event fd of every source that has callbacks */
		rte_spinlock_lock(&intr_lock);
		TAILQ_FOREACH(source, &intr_sources, next) {
			/* skip those with no callbacks */
			if (source->callbacks.tqh_first == NULL)
				continue;

			src_ev.events = EPOLLIN | EPOLLPRI;
			src_ev.data.fd = source->intr_handle.fd;

			if (epoll_ctl(epfd, EPOLL_CTL_ADD,
					source->intr_handle.fd, &src_ev) < 0) {
				rte_panic("Error adding fd %d epoll_ctl, %s\n",
					source->intr_handle.fd, strerror(errno));
			}
			nb_fds++;
		}
		rte_spinlock_unlock(&intr_lock);

		/* serve the interrupt */
		eal_intr_handle_interrupts(epfd, nb_fds);

		/* when we return, the list of fds to monitor must be
		 * rebuilt, so drop this epoll instance.
		 */
		close(epfd);
	}
}
然后函数调用eal_intr_handle_interrupts。
l eal_intr_handle_interrupts
eal_intr_handle_interrupts主要就是在死循环中调用epoll,然后处理中断。
-
static void eal_intr_handle_interrupts(int pfd, unsigned totalfds)
-
{
-
struct epoll_event events[totalfds];
-
int nfds = 0;
-
-
for(;;) {
-
nfds = epoll_wait(pfd, events, totalfds,
-
EAL_INTR_EPOLL_WAIT_FOREVER);
-
/* epoll_wait fail */
-
if (nfds < 0) {
-
if (errno == EINTR)
-
continue;
-
RTE_LOG(ERR, EAL,
-
"epoll_wait returns with fail\n");
-
return;
-
}
-
/* epoll_wait timeout, will never happens here */
-
else if (nfds == 0)
-
continue;
-
/* epoll_wait has at least one fd ready to read */
-
/* 注意只有这里返回小于0,这个无限循环才会退出 */
-
if (eal_intr_process_interrupts(events, nfds) < 0)
-
return;
-
}
-
}
这个函数在一个for(;;)死循环中,调用epoll_wait()阻塞模式监听事件。如果有事件发生,则调用eal_intr_process_interrupts()函数。
l eal_intr_process_interrupts
-
static int eal_intr_process_interrupts(struct epoll_event *events, int nfds)
-
{
-
int n, bytes_read;
-
struct rte_intr_source *src;
-
struct rte_intr_callback *cb;
-
union rte_intr_read_buffer buf;
-
struct rte_intr_callback active_cb;
-
-
for (n = 0; n < nfds; n++) {
-
-
/**
-
* if the pipe fd is ready to read, return out to
-
* rebuild the wait list.
-
*/
-
/*如果是pipefd有数据,说明有新注册的中断,返回-1让上层退出无限循环,重新扫描intr_sources 链表,添加中断fd*/
-
if (events[n].data.fd == intr_pipe.readfd){
-
int r = read(intr_pipe.readfd, buf.charbuf,
-
sizeof(buf.charbuf));
-
RTE_SET_USED(r);
-
return -1;
-
}
-
rte_spinlock_lock(&intr_lock);
-
/*遍历intr_sources 链表,处理上面的中断*/
-
TAILQ_FOREACH(src, &intr_sources, next)
-
if (src->intr_handle.fd ==
-
events[n].data.fd) /*判断设备是否产生了中断*/
-
break;
-
if (src == NULL){
-
rte_spinlock_unlock(&intr_lock);
-
continue;
-
}
-
-
/* mark this interrupt source as active and release the lock. */
-
src->active = 1; /*表明这个设备的中断尚未处理*/
-
rte_spinlock_unlock(&intr_lock);
-
-
/* set the length to be read dor different handle type */
-
/*根据中断设备的类型,UIO或者vfio等,设置要读取数据的大小*/
-
switch (src->intr_handle.type) {
-
case RTE_INTR_HANDLE_UIO:
-
case RTE_INTR_HANDLE_UIO_INTX:
-
bytes_read = sizeof(buf.uio_intr_count);
-
break;
-
case RTE_INTR_HANDLE_ALARM:
-
bytes_read = sizeof(buf.timerfd_num);
-
break;
-
#ifdef VFIO_PRESENT
-
case RTE_INTR_HANDLE_VFIO_MSIX:
-
case RTE_INTR_HANDLE_VFIO_MSI:
-
case RTE_INTR_HANDLE_VFIO_LEGACY:
-
bytes_read = sizeof(buf.vfio_intr_count);
-
break;
-
#endif
-
case RTE_INTR_HANDLE_EXT:
-
default:
-
bytes_read = 1;
-
break;
-
}
-
/*从uio或vfio中断设备中读取中断数据*/
-
if (src->intr_handle.type != RTE_INTR_HANDLE_EXT) {
-
/**
-
* read out to clear the ready-to-be-read flag
-
* for epoll_wait.
-
*/
-
bytes_read = read(events[n].data.fd, &buf, bytes_read);
-
if (bytes_read < 0) {
-
if (errno == EINTR || errno == EWOULDBLOCK)
-
continue;
-
-
RTE_LOG(ERR, EAL, "Error reading from file "
-
"descriptor %d: %s\n",
-
events[n].data.fd,
-
strerror(errno));
-
} else if (bytes_read == 0)
-
RTE_LOG(ERR, EAL, "Read nothing from file "
-
"descriptor %d\n", events[n].data.fd);
-
}
-
-
/* grab a lock, again to call callbacks and update status. */
-
rte_spinlock_lock(&intr_lock);
-
/*调用中断设备自己的中断处理函数*/
-
if (bytes_read > 0) {
-
-
/* Finally, call all callbacks. */
-
/* 注意是调用这个设备注册的所有中断处理函数 */
-
TAILQ_FOREACH(cb, &src->callbacks, next) {
-
-
/* make a copy and unlock. */
-
active_cb = *cb;
-
rte_spinlock_unlock(&intr_lock);
-
-
/* call the actual callback */
-
active_cb.cb_fn(&src->intr_handle,
-
active_cb.cb_arg);
-
-
/*get the lock back. */
-
rte_spinlock_lock(&intr_lock);
-
}
-
}
-
-
/* we done with that interrupt source, release it. */
-
src->active = 0; /*处理完中断后清除设备中断状态*/
-
rte_spinlock_unlock(&intr_lock);
-
}
-
-
return 0;
-
}
到此设备中断的相关初始化就结束了,整个过程如下所示:
3.5.2 设备中断注册
那么中断又是怎么注册的呢?这就不得不提rte_intr_callback_register这个函数,设备的中断处理都是通过这个函数注册的,我们看下他的实现。
l rte_intr_callback_register
-
int
-
rte_intr_callback_register(const struct rte_intr_handle *intr_handle,
-
rte_intr_callback_fn cb, void *cb_arg)
-
{
-
int ret, wake_thread;
-
struct rte_intr_source *src;
-
struct rte_intr_callback *callback;
-
-
wake_thread = 0;
-
-
/* first do parameter checking */
-
if (intr_handle == NULL || intr_handle->fd < 0 || cb == NULL) {
-
RTE_LOG(ERR, EAL,
-
"Registering with invalid input parameter\n");
-
return -EINVAL;
-
}
-
-
/* allocate a new interrupt callback entity */
-
callback = rte_zmalloc("interrupt callback list",
-
sizeof(*callback), 0);
-
if (callback == NULL) {
-
RTE_LOG(ERR, EAL, "Can not allocate memory\n");
-
return -ENOMEM;
-
}
-
/* 初始化callback */
-
callback->cb_fn = cb;
-
callback->cb_arg = cb_arg;
-
-
rte_spinlock_lock(&intr_lock);
-
-
/* check if there is at least one callback registered for the fd */
-
/* 遍历intr_sources链表,找对应的rte_intr_source */
-
TAILQ_FOREACH(src, &intr_sources, next) {
-
if (src->intr_handle.fd == intr_handle->fd) {
-
/* we had no interrupts for this */
-
/* 如果这个设备的这个中断之前没有注册过处理函数,则需要唤醒中断处理线程,将这个中断fd添加到epoll中 */
-
if TAILQ_EMPTY(&src->callbacks)
-
wake_thread = 1;
-
/* 如果这个中断已经有对应的处理函数了,说明已经在epoll中了,则只需要把新的callback加入链表 */
-
TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
-
ret = 0;
-
break;
-
}
-
}
-
/* 如果没有设备对应的rte_intr_source结构,则创建一个并添加到全局链表 */
-
/* no existing callbacks for this - add new source */
-
if (src == NULL) {
-
if ((src = rte_zmalloc("interrupt source list",
-
sizeof(*src), 0)) == NULL) {
-
RTE_LOG(ERR, EAL, "Can not allocate memory\n");
-
rte_free(callback);
-
ret = -ENOMEM;
-
} else {
-
src->intr_handle = *intr_handle;
-
TAILQ_INIT(&src->callbacks);
-
TAILQ_INSERT_TAIL(&(src->callbacks), callback, next);
-
TAILQ_INSERT_TAIL(&intr_sources, src, next);
-
wake_thread = 1;
-
ret = 0;
-
}
-
}
-
-
rte_spinlock_unlock(&intr_lock);
-
-
/**
-
* check if need to notify the pipe fd waited by epoll_wait to
-
* rebuild the wait list.
-
*/
-
if (wake_thread) /* 唤醒中断处理线程 */
-
if (write(intr_pipe.writefd, "1", 1) < 0)
-
return -EPIPE;
-
-
return ret;
-
}
这个函数主要是为中断创建一个rte_intr_source结构,我们从其参数可以看出来,参数正是rte_intr_source结构成员所需要的,然后将rte_intr_source结构加入全局链表intr_sources中,并通知前面创建的中断处理线程,中断处理线程可以再次遍历intr_sources,将新加入的rte_intr_source中的handle->fd加入epoll中处理。整个处理流程如下所示。
下面列举一个uio/vfio设备的中断回调函数注册的完整路径,以ixgbevf为例:
rte_eth_dev_pci_probe → eth_ixgbevf_dev_init → rte_intr_callback_register,其中rte_eth_dev_pci_probe在下面的“绑定驱动”中会介绍。
对应ixgbevf其调用如下,这次的中断处理函数为ixgbevf_dev_interrupt_handler。
rte_intr_callback_register(intr_handle,ixgbevf_dev_interrupt_handler,
eth_dev);
l ixgbevf_dev_interrupt_handler
-
/**
 * Interrupt callback registered for an ixgbevf device.
 *
 * @param handle
 *   Interrupt handle (unused).
 * @param param
 *   The struct rte_eth_dev the interrupt belongs to.
 */
static void ixgbevf_dev_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
					void *param)
{
	struct rte_eth_dev *eth_dev = param;

	/* NOTE(review): the original note says interrupts are temporarily
	 * disabled at this step; that is not visible in this snippet.
	 */
	ixgbevf_dev_interrupt_get_status(eth_dev);
	ixgbevf_dev_interrupt_action(eth_dev);
}
l ixgbevf_dev_interrupt_action
其中主要是SRIOV设备,mailbox的处理,这里不再展开。
-
static int ixgbevf_dev_interrupt_action(struct rte_eth_dev *dev)
-
{
-
struct ixgbe_hw *hw = IXGBE_DEV_PRIVATE_TO_HW(dev->data->dev_private);
-
struct ixgbe_interrupt *intr =
-
IXGBE_DEV_PRIVATE_TO_INTR(dev->data->dev_private);
-
-
if (intr->flags & IXGBE_FLAG_MAILBOX) {
-
ixgbevf_mbx_process(dev);
-
intr->flags &= ~IXGBE_FLAG_MAILBOX;
-
}
-
/*开启中断*/
-
ixgbevf_intr_enable(hw);
-
-
return 0;
-
}
3.5.3 接收队列中断注册
我们上面讲了设备的中断注册,但是上面所说的中断注册一般不是数据中断,而是控制中断,比如设备状态改变等情况。这种中断我们一般会设置intr_handle->fd,如上面的描述,但是如果我们想要注册设备的接收队列中断呢(rxq interrupt),由于设备可能是多队列,那么显然一个fd是不够的,所以我们可以像上面为vhost_user设备注册中断一样(eth_vhost_install_intr)使用intr_handle->efds这个数组为每个rxq设置一个中断fd。但是这就有个问题,我们在“中断初始化”中分析eal_intr_thread_main中讲过,中断处理线程仅会将intr_handle->fd加入epoll中,但是并不会添加intr_handle->efds。那我们设置intr_handle->efds该怎么使用呢?其实这就涉及到数据面的中断注册了,一个非常好的例子是dpdk代码中的examples\l3fwd-power。
普通的DPDK是采用的PMD模式,也就是轮询模式,这种模式下无论是否有报文处理,都是采用的轮询也就是CPU占用率100%;l3fwd-power就是为了解决这个问题,当CPU根本就不需要处理报文的时候进入省电模式也就是中断模式。我们这里只关注其中的中断注册,其他暂时不去分析。设备的rxq中断是从event_register注册的。
l event_register
-
static int event_register(struct lcore_conf *qconf)
-
{
-
struct lcore_rx_queue *rx_queue;
-
uint8_t portid, queueid;
-
uint32_t data;
-
int ret;
-
int i;
-
/* 为设备的每个接收队列调用rte_eth_dev_rx_intr_ctl_q注册中断 */
-
for (i = 0; i < qconf->n_rx_queue; ++i) {
-
rx_queue = &(qconf->rx_queue_list[i]);
-
portid = rx_queue->port_id;
-
queueid = rx_queue->queue_id;
-
data = portid << CHAR_BIT | queueid;
-
-
ret = rte_eth_dev_rx_intr_ctl_q(portid, queueid,
-
RTE_EPOLL_PER_THREAD,
-
RTE_INTR_EVENT_ADD,
-
(void *)((uintptr_t)data));
-
if (ret)
-
return ret;
-
}
-
-
return 0;
-
}
其中注册每个rxq的中断由rte_eth_dev_rx_intr_ctl_q函数完成,注意RTE_EPOLL_PER_THREAD的值为-1。
l rte_eth_dev_rx_intr_ctl_q
-
int rte_eth_dev_rx_intr_ctl_q(uint8_t port_id, uint16_t queue_id,
-
int epfd, int op, void *data)
-
{
-
uint32_t vec;
-
struct rte_eth_dev *dev;
-
struct rte_intr_handle *intr_handle;
-
int rc;
-
-
RTE_ETH_VALID_PORTID_OR_ERR_RET(port_id, -ENODEV);
-
/* 根据port_id找到对应的struct rte_eth_dev */
-
dev = &rte_eth_devices[port_id];
-
if (queue_id >= dev->data->nb_rx_queues) {
-
RTE_PMD_DEBUG_TRACE("Invalid RX queue_id=%u\n", queue_id);
-
return -EINVAL;
-
}
-
/* 检查设备是否初始化了intr_handle */
-
if (!dev->intr_handle) {
-
RTE_PMD_DEBUG_TRACE("RX Intr handle unset\n");
-
return -ENOTSUP;
-
}
-
-
intr_handle = dev->intr_handle;
-
if (!intr_handle->intr_vec) {
-
RTE_PMD_DEBUG_TRACE("RX Intr vector unset\n");
-
return -EPERM;
-
}
-
/* intr_handle->intr_vec[queue_id]为queue的ring idx */
-
vec = intr_handle->intr_vec[queue_id];
-
rc = rte_intr_rx_ctl(intr_handle, epfd, op, vec, data);
-
if (rc && rc != -EEXIST) {
-
RTE_PMD_DEBUG_TRACE("p %u q %u rx ctl error"
-
" op %d epfd %d vec %u\n",
-
port_id, queue_id, op, epfd, vec);
-
return rc;
-
}
-
-
return 0;
-
}
这个函数调用了一系列检查,最终调用rte_intr_rx_ctl完成中断fd注册。在看rte_intr_rx_ctl实现之前,先看下rte_intr_handle的之前没展开的细节结构,如下所示。
l rte_intr_rx_ctl
-
int
-
rte_intr_rx_ctl(struct rte_intr_handle *intr_handle, int epfd,
-
int op, unsigned int vec, void *data)
-
{
-
struct rte_epoll_event *rev;
-
struct rte_epoll_data *epdata;
-
int epfd_op;
-
unsigned int efd_idx;
-
int rc = 0;
-
-
efd_idx = (vec >= RTE_INTR_VEC_RXTX_OFFSET) ?
-
(vec - RTE_INTR_VEC_RXTX_OFFSET) : vec;
-
-
if (!intr_handle || intr_handle->nb_efd == 0 ||
-
efd_idx >= intr_handle->nb_efd) {
-
RTE_LOG(ERR, EAL, "Wrong intr vector number.\n");
-
return -EPERM;
-
}
-
-
switch (op) {
-
case RTE_INTR_EVENT_ADD:
-
epfd_op = EPOLL_CTL_ADD;
-
rev = &intr_handle->elist[efd_idx];
-
/* rev->status != RTE_EPOLL_INVALID说明这个中断fd已经加入了epoll了 */
-
if (rev->status != RTE_EPOLL_INVALID) {
-
RTE_LOG(INFO, EAL, "Event already been added.\n");
-
return -EEXIST;
-
}
-
/* 设置intr_handle->elist[efd_idx].epdata */
-
/* attach to intr vector fd */
-
epdata = &rev->epdata;
-
epdata->event = EPOLLIN | EPOLLPRI | EPOLLET;
-
epdata->data = data;
-
epdata->cb_fun = (rte_intr_event_cb_t)eal_intr_proc_rxtx_intr;
-
epdata->cb_arg = (void *)intr_handle;
-
/* 注意这里传入的是intr_handle->efds[efd_idx] */
-
rc = rte_epoll_ctl(epfd, epfd_op,
-
intr_handle->efds[efd_idx], rev);
-
if (!rc)
-
RTE_LOG(DEBUG, EAL,
-
"efd %d associated with vec %d added on epfd %d"
-
"\n", rev->fd, vec, epfd);
-
else
-
rc = -EPERM;
-
break;
-
case RTE_INTR_EVENT_DEL:
-
epfd_op = EPOLL_CTL_DEL;
-
rev = &intr_handle->elist[efd_idx];
-
if (rev->status == RTE_EPOLL_INVALID) {
-
RTE_LOG(INFO, EAL, "Event does not exist.\n");
-
return -EPERM;
-
}
-
-
rc = rte_epoll_ctl(rev->epfd, epfd_op, rev->fd, rev);
-
if (rc)
-
rc = -EPERM;
-
break;
-
default:
-
RTE_LOG(ERR, EAL, "event op type mismatch\n");
-
rc = -EPERM;
-
}
-
-
return rc;
-
}
由于是中断注册,我们只关注RTE_INTR_EVENT_ADD的逻辑,这里我们终于看到了intr_handle->efds[efd_idx],通过rte_epoll_ctl进行注册,同时我们也看到了这里会初始化一个中断处理函数eal_intr_proc_rxtx_intr,这个我们后面分析。
l rte_epoll_ctl
-
int
-
rte_epoll_ctl(int epfd, int op, int fd,
-
struct rte_epoll_event *event)
-
{
-
struct epoll_event ev;
-
-
if (!event) {
-
RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
-
return -1;
-
}
-
-
/* using per thread epoll fd */
-
/*如果epfd为-1,则创建epollfd,注意这里把epollfd存放在了“每线程变量中”*/
-
if (epfd == RTE_EPOLL_PER_THREAD)
-
epfd = rte_intr_tls_epfd();
-
-
if (op == EPOLL_CTL_ADD) {
-
event->status = RTE_EPOLL_VALID;
-
event->fd = fd; /* ignore fd in event */
-
event->epfd = epfd;
-
ev.data.ptr = (void *)event;
-
}
-
-
ev.events = event->epdata.event;
-
/*添加到epoll中*/
-
if (epoll_ctl(epfd, op, fd, &ev) < 0) {
-
RTE_LOG(ERR, EAL, "Error op %d fd %d epoll_ctl, %s\n",
-
op, fd, strerror(errno));
-
if (op == EPOLL_CTL_ADD)
-
/* rollback status when CTL_ADD fail */
-
event->status = RTE_EPOLL_INVALID;
-
return -1;
-
}
-
-
if (op == EPOLL_CTL_DEL && event->status != RTE_EPOLL_INVALID)
-
eal_epoll_data_safe_free(event);
-
-
return 0;
-
}
这个函数主要就是创建一个per thread的epollfd,然后调用了epoll_ctl来将rxq的fd加入epollfd。到此中断注册就完成了。下面我们看中断回调过程。整个中断线程就是dataplane的主线程。具体不再展开,调用路径如下所示。
这里我们主要看一下rte_epoll_wait的处理逻辑,之所以要对epoll_wait进行一次封装,主要是在epoll_wait返回后调用了eal_epoll_process_event。
l rte_epoll_wait
-
int rte_epoll_wait(int epfd, struct rte_epoll_event *events,
-
int maxevents, int timeout)
-
{
-
struct epoll_event evs[maxevents];
-
int rc;
-
-
if (!events) {
-
RTE_LOG(ERR, EAL, "rte_epoll_event can't be NULL\n");
-
return -1;
-
}
-
-
/* using per thread epoll fd */
-
/* 获取之前创建的epollfd */
-
if (epfd == RTE_EPOLL_PER_THREAD)
-
epfd = rte_intr_tls_epfd();
-
-
while (1) {
-
rc = epoll_wait(epfd, evs, maxevents, timeout);
-
if (likely(rc > 0)) {
-
/* epoll_wait has at least one fd ready to read */
-
rc = eal_epoll_process_event(evs, rc, events);
-
break;
-
} else if (rc < 0) {
-
if (errno == EINTR)
-
continue;
-
/* epoll_wait fail */
-
RTE_LOG(ERR, EAL, "epoll_wait returns with fail %s\n",
-
strerror(errno));
-
rc = -1;
-
break;
-
} else {
-
/* rc == 0, epoll_wait timed out */
-
break;
-
}
-
}
-
-
return rc;
-
}
l eal_epoll_process_event
-
static int
-
eal_epoll_process_event(struct epoll_event *evs, unsigned int n,
-
struct rte_epoll_event *events)
-
{
-
unsigned int i, count = 0;
-
struct rte_epoll_event *rev;
-
-
for (i = 0; i < n; i++) {
-
rev = evs[i].data.ptr;
-
if (!rev || !rte_atomic32_cmpset(&rev->status, RTE_EPOLL_VALID,
-
RTE_EPOLL_EXEC))
-
continue;
-
-
events[count].status = RTE_EPOLL_VALID;
-
events[count].fd = rev->fd;
-
events[count].epfd = rev->epfd;
-
events[count].epdata.event = rev->epdata.event;
-
events[count].epdata.data = rev->epdata.data;
-
if (rev->epdata.cb_fun)
-
rev->epdata.cb_fun(rev->fd,
-
rev->epdata.cb_arg);
-
-
rte_compiler_barrier();
-
rev->status = RTE_EPOLL_VALID;
-
count++;
-
}
-
return count;
-
}
而eal_epoll_process_event的主要逻辑就是调用之前rte_intr_rx_ctl中注册的epdata.cb_fun,也就是eal_intr_proc_rxtx_intr。
l eal_intr_proc_rxtx_intr
这个函数其实主要就是读出fd中的数据,以免下次epoll_wait时直接返回。当然这是dpdk 17.02的实现,在18.05中加入了RTE_INTR_HANDLE_VDEV,也就是之前我们注册vhost_user时使用的handle type,对于RTE_INTR_HANDLE_VDEV是不需要从fd读数据的,所以bytes_read为0。
-
static void eal_intr_proc_rxtx_intr(int fd, const struct rte_intr_handle *intr_handle)
-
{
-
union rte_intr_read_buffer buf;
-
int bytes_read = 1;
-
int nbytes;
-
-
switch (intr_handle->type) {
-
case RTE_INTR_HANDLE_UIO:
-
case RTE_INTR_HANDLE_UIO_INTX:
-
bytes_read = sizeof(buf.uio_intr_count);
-
break;
-
#ifdef VFIO_PRESENT
-
case RTE_INTR_HANDLE_VFIO_MSIX:
-
case RTE_INTR_HANDLE_VFIO_MSI:
-
case RTE_INTR_HANDLE_VFIO_LEGACY:
-
bytes_read = sizeof(buf.vfio_intr_count);
-
break;
-
#endif
-
default:
-
bytes_read = 1;
-
RTE_LOG(INFO, EAL, "unexpected intr type\n");
-
break;
-
}
-
-
/**
-
* read out to clear the ready-to-be-read flag
-
* for epoll_wait.
-
*/
-
do {
-
nbytes = read(fd, &buf, bytes_read);
-
if (nbytes < 0) {
-
if (errno == EINTR || errno == EWOULDBLOCK ||
-
errno == EAGAIN)
-
continue;
-
RTE_LOG(ERR, EAL,
-
"Error reading from fd %d: %s\n",
-
fd, strerror(errno));
-
} else if (nbytes == 0)
-
RTE_LOG(ERR, EAL, "Read nothing from fd %d\n", fd);
-
return;
-
} while (1);
-
}
之后就返回主线程了,主线程函数在rte_epoll_wait返回后调用收包逻辑处理。
下面是整个中断注册回调逻辑图。
阅读(13060) | 评论(4) | 转发(1) |