dpdk网卡驱动发送函数分析
——lvyilong316
这里主要分析一下dpdk网卡驱动的发送流程,如何将应用中mbuf中的数据DMA到网卡硬件。这里以Broadcom的bnxt驱动为例分析,代码来自dpdk 17.11。
bnxt的发送函数tx_pkt_burst被初始化为了bnxt_xmit_pkts。
eth_dev->tx_pkt_burst
= &bnxt_xmit_pkts;
● bnxt_xmit_pkts
-
uint16_t bnxt_xmit_pkts(void *tx_queue, struct rte_mbuf **tx_pkts,
-
uint16_t nb_pkts)
-
{
-
struct bnxt_tx_queue *txq = tx_queue;
-
uint16_t nb_tx_pkts = 0;
-
uint16_t db_mask = txq->tx_ring->tx_ring_struct->ring_size >> 2;
-
uint16_t last_db_mask = 0;
-
-
/* Handle TX completions */
-
bnxt_handle_tx_cp(txq);
-
-
/* Handle TX burst request */
-
for (nb_tx_pkts = 0; nb_tx_pkts < nb_pkts; nb_tx_pkts++) {
-
if (bnxt_start_xmit(tx_pkts[nb_tx_pkts], txq)) {
-
break;
-
} else if ((nb_tx_pkts & db_mask) != last_db_mask) {
-
B_TX_DB(txq->tx_ring->tx_doorbell,
-
txq->tx_ring->tx_prod);
-
last_db_mask = nb_tx_pkts & db_mask;
-
}
-
}
-
if (nb_tx_pkts)
-
B_TX_DB(txq->tx_ring->tx_doorbell, txq->tx_ring->tx_prod);
-
-
return nb_tx_pkts;
-
}
这个函数的逻辑可以分为三个部分来看:
首先是bnxt_handle_tx_cp,这里的cp是指complete(完成),这个函数主要负责处理之前网卡已经发送完成的mbuf,也就是网卡已经通过DMA将mbuf中的数据拷贝走,软件可以释放mbuf的逻辑;
其次是bnxt_start_xmit,这个是真正的发送逻辑,其实这里的发送也并不是真的把数据拷贝到网卡上,而是根据每个mbuf的数据地址设置到bd ring,从而告诉网卡DMA拷贝的源地址;
最后是B_TX_DB对tx_doorbell的写操作,作用就是前面bd的地址信息已经填充完毕,告诉网卡可以发起DMA了。
在分析具体函数前,首先熟悉一下相关的数据结构。和tx_queue相关联的有两个ring,一个是tx_ring(发送ring),一个是cp_ring(完成ring)。其数据结构关系如下:
下面分别介绍三个阶段的实现。
释放已经DMA完成的mbuf
● bnxt_handle_tx_cp
-
static int bnxt_handle_tx_cp(struct bnxt_tx_queue *txq)
-
{
-
struct bnxt_cp_ring_info *cpr = txq->cp_ring;
-
uint32_t raw_cons = cpr->cp_raw_cons; /* 记录完成队列上次完成队列释放(consumer)的index */
-
uint32_t cons;
-
int nb_tx_pkts = 0;
-
struct tx_cmpl *txcmp;
-
-
if ((txq->tx_ring->tx_ring_struct->ring_size -
-
(bnxt_tx_avail(txq->tx_ring))) >
-
txq->tx_free_thresh) { /*如果发送ring中的已用desc数量大于tx_free_thresh*/
-
while (1) {
-
cons = RING_CMP(cpr->cp_ring_struct, raw_cons);
-
txcmp = (struct tx_cmpl *)&cpr->cp_desc_ring[cons];
-
-
/* struct tx_cmpl中的type由硬件设置,TX_CMPL_TYPE_TX_L2表示网卡已经DMA完成,软件可以释放mbuf中的数据了 */
-
if (CMP_TYPE(txcmp) == TX_CMPL_TYPE_TX_L2)
-
nb_tx_pkts++;
-
else
-
RTE_LOG_DP(DEBUG, PMD,
-
"Unhandled CMP type %02x\n",
-
CMP_TYPE(txcmp));
-
raw_cons = NEXT_RAW_CMP(raw_cons); /* raw_cons = raw_cons + 1 */
-
}
-
if (nb_tx_pkts) /* nb_tx_pkts记录了本次可以释放的mbuf数量 */
-
bnxt_tx_cmp(txq, nb_tx_pkts); /* 释放mbuf */
-
cpr->cp_raw_cons = raw_cons; /* 更新 cpr->cp_raw_cons */
-
B_CP_DIS_DB(cpr, cpr->cp_raw_cons); /* 通过cp ring的cp_doorbell通知硬件对应的cp ring bd已经可以释放了*/
-
}
-
return nb_tx_pkts;
-
}
真正释放mbuf的操作是在bnxt_tx_cmp函数完成的。
● bnxt_tx_cmp
-
static void bnxt_tx_cmp(struct bnxt_tx_queue *txq, int nr_pkts)
-
{
-
struct bnxt_tx_ring_info *txr = txq->tx_ring;
-
uint16_t cons = txr->tx_cons;
-
int i, j;
-
-
for (i = 0; i < nr_pkts; i++) {
-
struct bnxt_sw_tx_bd *tx_buf;
-
struct rte_mbuf *mbuf;
-
-
tx_buf = &txr->tx_buf_ring[cons]; /*tx_buf_ring存放txring中的mbuf*/
-
cons = RING_NEXT(txr->tx_ring_struct, cons);
-
mbuf = tx_buf->mbuf;
-
tx_buf->mbuf = NULL;
-
-
/* EW - no need to unmap DMA memory? */
-
/* tx_buf->nr_bds记录一个mbuf对应的bd数量,一个mbuf可能对应多个bd */
-
for (j = 1; j < tx_buf->nr_bds; j++)
-
cons = RING_NEXT(txr->tx_ring_struct, cons); /* cons = cons + 1 */
-
rte_pktmbuf_free(mbuf);
-
}
-
-
txr->tx_cons = cons; /* 清空了一部分mbuf,更新consumer index */
-
}
这里注意一点,在函数的最后会更新tx_ring的consumer index。虽然对于发送端来说,软件驱动是producer(生产数据),网卡是consumer(消费数据),但是真正释放数据还是由软件驱动完成,所以consumer index也要由软件来更新。
数据包发送
数据包发送是在bnxt_start_xmit中完成的。
● bnxt_start_xmit
-
static uint16_t bnxt_start_xmit(struct rte_mbuf *tx_pkt,
-
struct bnxt_tx_queue *txq)
-
{
-
struct bnxt_tx_ring_info *txr = txq->tx_ring;
-
struct tx_bd_long *txbd;
-
struct tx_bd_long_hi *txbd1;
-
uint32_t vlan_tag_flags, cfa_action;
-
bool long_bd = false;
-
uint16_t last_prod = 0;
-
struct rte_mbuf *m_seg;
-
struct bnxt_sw_tx_bd *tx_buf;
-
static const uint32_t lhint_arr[4] = {
-
TX_BD_LONG_FLAGS_LHINT_LT512,
-
TX_BD_LONG_FLAGS_LHINT_LT1K,
-
TX_BD_LONG_FLAGS_LHINT_LT2K,
-
TX_BD_LONG_FLAGS_LHINT_LT2K
-
};
-
-
if (tx_pkt->ol_flags & (PKT_TX_TCP_SEG | PKT_TX_TCP_CKSUM |
-
PKT_TX_UDP_CKSUM | PKT_TX_IP_CKSUM |
-
PKT_TX_VLAN_PKT | PKT_TX_OUTER_IP_CKSUM))
-
long_bd = true;
-
/* 1. 将待发送的mbuf放入tx_ring的bnxt_sw_tx_bd中 */
-
tx_buf = &txr->tx_buf_ring[txr->tx_prod];
-
tx_buf->mbuf = tx_pkt;
-
tx_buf->nr_bds = long_bd + tx_pkt->nb_segs;
-
/* 一个mbuf可能对应多个bd,last_prod指向该mbuf对应的最后一个bd的index */
-
last_prod = (txr->tx_prod + tx_buf->nr_bds - 1) &
-
txr->tx_ring_struct->ring_mask;
-
-
if (unlikely(bnxt_tx_avail(txr) < tx_buf->nr_bds))
-
return -ENOMEM;
-
/* 2. 根据mbuf的信息设置rx tx_desc_ring中对应的bd,其中关键是txbd->addr */
-
txbd = &txr->tx_desc_ring[txr->tx_prod];
-
txbd->opaque = txr->tx_prod;
-
txbd->flags_type = tx_buf->nr_bds << TX_BD_LONG_FLAGS_BD_CNT_SFT;
-
txbd->len = tx_pkt->data_len;
-
if (txbd->len >= 2014)
-
txbd->flags_type |= TX_BD_LONG_FLAGS_LHINT_GTE2K;
-
else
-
txbd->flags_type |= lhint_arr[txbd->len >> 9];
-
/* txbd->addr是mbuf的dma地址,也就是iova地址 */
-
txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(tx_buf->mbuf)); /* txr->tx_prod = txr->tx_prod + 1 */
-
-
if (long_bd) {
-
txbd->flags_type |= TX_BD_LONG_TYPE_TX_BD_LONG;
-
vlan_tag_flags = 0;
-
cfa_action = 0;
-
if (tx_buf->mbuf->ol_flags & PKT_TX_VLAN_PKT) {
-
/* shurd: Should this mask at
-
* TX_BD_LONG_CFA_META_VLAN_VID_MASK?
-
*/
-
vlan_tag_flags = TX_BD_LONG_CFA_META_KEY_VLAN_TAG |
-
tx_buf->mbuf->vlan_tci;
-
/* Currently supports 8021Q, 8021AD vlan offloads
-
* QINQ1, QINQ2, QINQ3 vlan headers are deprecated
-
*/
-
/* DPDK only supports 802.11q VLAN packets */
-
vlan_tag_flags |=
-
TX_BD_LONG_CFA_META_VLAN_TPID_TPID8100;
-
}
-
/* 更新tx_ring的productor index */
-
txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod);
-
-
txbd1 = (struct tx_bd_long_hi *)
-
&txr->tx_desc_ring[txr->tx_prod];
-
txbd1->lflags = 0;
-
txbd1->cfa_meta = vlan_tag_flags;
-
txbd1->cfa_action = cfa_action;
-
/* 根据mbuf的ol_flags设置bd中对应的flag */
-
if (tx_pkt->ol_flags & PKT_TX_TCP_SEG) {
-
/* TSO */
-
txbd1->lflags |= TX_BD_LONG_LFLAGS_LSO;
-
txbd1->hdr_size = tx_pkt->l2_len + tx_pkt->l3_len +
-
tx_pkt->l4_len + tx_pkt->outer_l2_len +
-
tx_pkt->outer_l3_len;
-
txbd1->mss = tx_pkt->tso_segsz;
-
-
} else if ((tx_pkt->ol_flags & PKT_TX_OIP_IIP_TCP_UDP_CKSUM) ==
-
PKT_TX_OIP_IIP_TCP_UDP_CKSUM) {
-
/* Outer IP, Inner IP, Inner TCP/UDP CSO */
-
txbd1->lflags |= TX_BD_FLG_TIP_IP_TCP_UDP_CHKSUM;
-
txbd1->mss = 0;
-
} else if ((tx_pkt->ol_flags & PKT_TX_IIP_TCP_UDP_CKSUM) ==
-
PKT_TX_IIP_TCP_UDP_CKSUM) {
-
/* (Inner) IP, (Inner) TCP/UDP CSO */
-
txbd1->lflags |= TX_BD_FLG_IP_TCP_UDP_CHKSUM;
-
txbd1->mss = 0;
-
} else if ((tx_pkt->ol_flags & PKT_TX_OIP_TCP_UDP_CKSUM) ==
-
PKT_TX_OIP_TCP_UDP_CKSUM) {
-
/* Outer IP, (Inner) TCP/UDP CSO */
-
txbd1->lflags |= TX_BD_FLG_TIP_TCP_UDP_CHKSUM;
-
txbd1->mss = 0;
-
} else if ((tx_pkt->ol_flags & PKT_TX_OIP_IIP_CKSUM) ==
-
PKT_TX_OIP_IIP_CKSUM) {
-
/* Outer IP, Inner IP CSO */
-
txbd1->lflags |= TX_BD_FLG_TIP_IP_CHKSUM;
-
txbd1->mss = 0;
-
} else if ((tx_pkt->ol_flags & PKT_TX_TCP_UDP_CKSUM) ==
-
PKT_TX_TCP_UDP_CKSUM) {
-
/* TCP/UDP CSO */
-
txbd1->lflags |= TX_BD_LONG_LFLAGS_TCP_UDP_CHKSUM;
-
txbd1->mss = 0;
-
} else if (tx_pkt->ol_flags & PKT_TX_IP_CKSUM) {
-
/* IP CSO */
-
txbd1->lflags |= TX_BD_LONG_LFLAGS_IP_CHKSUM;
-
txbd1->mss = 0;
-
} else if (tx_pkt->ol_flags & PKT_TX_OUTER_IP_CKSUM) {
-
/* IP CSO */
-
txbd1->lflags |= TX_BD_LONG_LFLAGS_T_IP_CHKSUM;
-
txbd1->mss = 0;
-
}
-
} else {
-
txbd->flags_type |= TX_BD_SHORT_TYPE_TX_BD_SHORT;
-
}
-
-
m_seg = tx_pkt->next;
-
/* i is set at the end of the if(long_bd) block */
-
while (txr->tx_prod != last_prod) {
-
/* 更新tx_ring的productor index */
-
txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod); /* txr->tx_prod = txr->tx_prod + 1 */
-
tx_buf = &txr->tx_buf_ring[txr->tx_prod];
-
-
txbd = &txr->tx_desc_ring[txr->tx_prod];
-
txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(m_seg));
-
txbd->flags_type = TX_BD_SHORT_TYPE_TX_BD_SHORT;
-
txbd->len = m_seg->data_len;
-
-
m_seg = m_seg->next;
-
}
-
-
txbd->flags_type |= TX_BD_LONG_FLAGS_PACKET_END;
-
/* 更新tx_ring的productor index */
-
txr->tx_prod = RING_NEXT(txr->tx_ring_struct, txr->tx_prod); /* txr->tx_prod = txr->tx_prod + 1 */
-
-
return 0;
-
}
其中值得注意的有两点,一个是mbuf向bd(tx_bd_long)转换的过程,其bd地址设置为mbuf的iova地址,也就是dma地址。
-
txbd->addr = rte_cpu_to_le_32(RTE_MBUF_DATA_DMA_ADDR(tx_buf->mbuf));
-
#define RTE_MBUF_DATA_DMA_ADDR(mb) \
-
((uint64_t)((mb)->buf_iova + (mb)->data_off))
另一方面是在发送过程中会更新tx_ring的producer index。
启动DMA
启动硬件DMA拷贝是通过以下语句完成:
B_TX_DB(txq->tx_ring->tx_doorbell,
txq->tx_ring->tx_prod)
将tx_ring的producer index(tx_ring->tx_prod)写入tx_ring的tx_doorbell中。而无论是cp_ring的cp_doorbell还是tx_ring的tx_doorbell,都是在bnxt_alloc_hwrm_rings函数中初始化为设备的bar空间地址的。
-
cpr->cp_doorbell = (char *)pci_dev->mem_resource[2].addr + idx * 0x80;
-
txr->tx_doorbell = (char *)pci_dev->mem_resource[2].addr + idx * 0x80;
阅读(15891) | 评论(0) | 转发(0) |