在做驱动的时候,写网卡驱动,最多会用到register_netdev、alloc_netdev这些常用的接口,然后就是结构体的初始化.至于注册进入内核后是一个怎么样的流程,一直处于自己觉得明白、说起来却很模糊的状态.当然我也是一直参考《深入理解Linux网络内幕》作为学习的引导.参考内核2.6.32.60 .
首先我们看看alloc_netdev :include/linux/netdevice.h
#define alloc_netdev(sizeof_priv, name, setup) \
alloc_netdev_mq(sizeof_priv, name, setup, 1)
-
/**
-
* alloc_netdev_mq - allocate network device
-
* @sizeof_priv: size of private data to allocate space for
-
* @name: device name format string
-
* @setup: callback to initialize device
-
* @queue_count: the number of subqueues to allocate
-
*
-
* Allocates a struct net_device with private data area for driver use
-
* and performs basic initialization. Also allocates subquue structs
-
* for each queue on the device at the end of the netdevice.
-
*/
-
alloc_netdev_mq 的实现(net/core/dev.c):
-
struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
-
void (*setup)(struct net_device *), unsigned int queue_count)
-
{
-
struct netdev_queue *tx;
-
struct net_device *dev;
-
size_t alloc_size;
-
struct net_device *p;
-
-
BUG_ON(strlen(name) >= sizeof(dev->name));
-
-
alloc_size = sizeof(struct net_device);
-
if (sizeof_priv) {
-
/* ensure 32-byte alignment of private area */
-
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
-
alloc_size += sizeof_priv;
-
}
-
/* ensure 32-byte alignment of whole construct */
-
alloc_size += NETDEV_ALIGN - 1;
-
-
p = kzalloc(alloc_size, GFP_KERNEL);
-
if (!p) {
-
printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
-
return NULL;
-
}
-
-
tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
-
if (!tx) {
-
printk(KERN_ERR "alloc_netdev: Unable to allocate "
-
"tx qdiscs.\n");
-
goto free_p;
-
}
-
-
dev = PTR_ALIGN(p, NETDEV_ALIGN);
-
dev->padded = (char *)dev - (char *)p;
-
-
if (dev_addr_init(dev))
-
goto free_tx;
-
-
dev_unicast_init(dev);
-
-
dev_net_set(dev, &init_net);
-
-
dev->_tx = tx;
-
dev->num_tx_queues = queue_count;
-
dev->real_num_tx_queues = queue_count;
-
-
dev->gso_max_size = GSO_MAX_SIZE;
-
-
netdev_init_queues(dev);
-
-
INIT_LIST_HEAD(&dev->napi_list);
-
dev->priv_flags = IFF_XMIT_DST_RELEASE;
-
setup(dev);
-
strcpy(dev->name, name);
-
return dev;
-
-
free_tx:
-
kfree(tx);
-
-
free_p:
-
kfree(p);
-
return NULL;
-
}
我们看到这个函数主要申请dev结构,初始化addr、dev->_tx 以及queues(通过netdev_init_queues(dev)),并初始化dev->napi_list. 最后调用setup回调,以太网默认是ether_setup.
这里我们关心一下
-
static void netdev_init_queues(struct net_device *dev)
-
{
-
netdev_init_one_queue(dev, &dev->rx_queue, NULL);
-
netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
-
spin_lock_init(&dev->tx_global_lock);
-
}
其实就是把收发队列里的dev指向当前设备.
当然alloc_netdev还有很多变体API,例如以太网驱动常用的alloc_etherdev(它以ether_setup作为setup回调).
这里我们也应该关注一下struct net_device这个结构体
-
/*
-
* The DEVICE structure.
-
* Actually, this whole structure is a big mistake. It mixes I/O
-
* data with strictly "high-level" data, and it has to know about
-
* almost every data structure used in the INET module.
-
*
-
* FIXME: cleanup struct net_device such that network protocol info
-
* moves out.
-
*/
-
-
struct net_device
-
{
-
-
/*
-
* This is the first field of the "visible" part of this structure
-
* (i.e. as seen by users in the "Space.c" file). It is the name
-
* the interface.
-
*/
-
char name[IFNAMSIZ];
-
/* device name hash chain */
-
struct hlist_node name_hlist;
-
/* snmp alias */
-
char *ifalias;
-
-
/*
-
* I/O specific fields
-
* FIXME: Merge these and struct ifmap into one
-
*/
-
unsigned long mem_end; /* shared mem end */
-
unsigned long mem_start; /* shared mem start */
-
unsigned long base_addr; /* device I/O address */
-
unsigned int irq; /* device IRQ number */
-
-
/*
-
* Some hardware also needs these fields, but they are not
-
* part of the usual set specified in Space.c.
-
*/
-
-
unsigned char if_port; /* Selectable AUI, TP,..*/
-
unsigned char dma; /* DMA channel */
-
-
unsigned long state;
-
-
struct list_head dev_list;
-
struct list_head napi_list;
-
-
/* Net device features */
-
unsigned long features;
-
#define NETIF_F_SG 1 /* Scatter/gather IO. */
-
#define NETIF_F_IP_CSUM 2 /* Can checksum TCP/UDP over IPv4. */
-
#define NETIF_F_NO_CSUM 4 /* Does not require checksum. F.e. loopack. */
-
#define NETIF_F_HW_CSUM 8 /* Can checksum all the packets. */
-
#define NETIF_F_IPV6_CSUM 16 /* Can checksum TCP/UDP over IPV6 */
-
#define NETIF_F_HIGHDMA 32 /* Can DMA to high memory. */
-
#define NETIF_F_FRAGLIST 64 /* Scatter/gather IO. */
-
#define NETIF_F_HW_VLAN_TX 128 /* Transmit VLAN hw acceleration */
-
#define NETIF_F_HW_VLAN_RX 256 /* Receive VLAN hw acceleration */
-
#define NETIF_F_HW_VLAN_FILTER 512 /* Receive filtering on VLAN */
-
#define NETIF_F_VLAN_CHALLENGED 1024 /* Device cannot handle VLAN packets */
-
#define NETIF_F_GSO 2048 /* Enable software GSO. */
-
#define NETIF_F_LLTX 4096 /* LockLess TX - deprecated. Please */
-
/* do not use LLTX in new drivers */
-
#define NETIF_F_NETNS_LOCAL 8192 /* Does not change network namespaces */
-
#define NETIF_F_GRO 16384 /* Generic receive offload */
-
#define NETIF_F_LRO 32768 /* large receive offload */
-
-
/* the GSO_MASK reserves bits 16 through 23 */
-
#define NETIF_F_FCOE_CRC (1 << 24) /* FCoE CRC32 */
-
#define NETIF_F_SCTP_CSUM (1 << 25) /* SCTP checksum offload */
-
#define NETIF_F_FCOE_MTU (1 << 26) /* Supports max FCoE MTU, 2158 bytes*/
-
-
/* Segmentation offload features */
-
#define NETIF_F_GSO_SHIFT 16
-
#define NETIF_F_GSO_MASK 0x00ff0000
-
#define NETIF_F_TSO (SKB_GSO_TCPV4 << NETIF_F_GSO_SHIFT)
-
#define NETIF_F_UFO (SKB_GSO_UDP << NETIF_F_GSO_SHIFT)
-
#define NETIF_F_GSO_ROBUST (SKB_GSO_DODGY << NETIF_F_GSO_SHIFT)
-
#define NETIF_F_TSO_ECN (SKB_GSO_TCP_ECN << NETIF_F_GSO_SHIFT)
-
#define NETIF_F_TSO6 (SKB_GSO_TCPV6 << NETIF_F_GSO_SHIFT)
-
#define NETIF_F_FSO (SKB_GSO_FCOE << NETIF_F_GSO_SHIFT)
-
-
/* List of features with software fallbacks. */
-
#define NETIF_F_GSO_SOFTWARE (NETIF_F_TSO | NETIF_F_TSO_ECN | NETIF_F_TSO6)
-
-
-
#define NETIF_F_GEN_CSUM (NETIF_F_NO_CSUM | NETIF_F_HW_CSUM)
-
#define NETIF_F_V4_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IP_CSUM)
-
#define NETIF_F_V6_CSUM (NETIF_F_GEN_CSUM | NETIF_F_IPV6_CSUM)
-
#define NETIF_F_ALL_CSUM (NETIF_F_V4_CSUM | NETIF_F_V6_CSUM)
-
-
/*
-
* If one device supports one of these features, then enable them
-
* for all in netdev_increment_features.
-
*/
-
#define NETIF_F_ONE_FOR_ALL (NETIF_F_GSO_SOFTWARE | NETIF_F_GSO_ROBUST | \
-
NETIF_F_SG | NETIF_F_HIGHDMA | \
-
NETIF_F_FRAGLIST)
-
-
/* Interface index. Unique device identifier */
-
int ifindex;
-
int iflink;
-
-
struct net_device_stats stats;
-
-
#ifdef CONFIG_WIRELESS_EXT
-
/* List of functions to handle Wireless Extensions (instead of ioctl).
-
* See <net/iw_handler.h> for details. Jean II */
-
const struct iw_handler_def * wireless_handlers;
-
/* Instance data managed by the core of Wireless Extensions. */
-
struct iw_public_data * wireless_data;
-
#endif
-
/* Management operations */
-
const struct net_device_ops *netdev_ops;
-
const struct ethtool_ops *ethtool_ops;
-
-
/* Hardware header description */
-
const struct header_ops *header_ops;
-
-
unsigned int flags; /* interface flags (a la BSD) */
-
unsigned short gflags;
-
unsigned short priv_flags; /* Like 'flags' but invisible to userspace. */
-
unsigned short padded; /* How much padding added by alloc_netdev() */
-
-
unsigned char operstate; /* RFC2863 operstate */
-
unsigned char link_mode; /* mapping policy to operstate */
-
-
unsigned mtu; /* interface MTU value */
-
unsigned short type; /* interface hardware type */
-
unsigned short hard_header_len; /* hardware hdr length */
-
-
/* extra head- and tailroom the hardware may need, but not in all cases
-
* can this be guaranteed, especially tailroom. Some cases also use
-
* LL_MAX_HEADER instead to allocate the skb.
-
*/
-
unsigned short needed_headroom;
-
unsigned short needed_tailroom;
-
-
struct net_device *master; /* Pointer to master device of a group,
-
* which this device is member of.
-
*/
-
-
/* Interface address info. */
-
unsigned char perm_addr[MAX_ADDR_LEN]; /* permanent hw address */
-
unsigned char addr_len; /* hardware address length */
-
unsigned short dev_id; /* for shared network cards */
-
-
struct netdev_hw_addr_list uc; /* Secondary unicast
-
mac addresses */
-
int uc_promisc;
-
spinlock_t addr_list_lock;
-
struct dev_addr_list *mc_list; /* Multicast mac addresses */
-
int mc_count; /* Number of installed mcasts */
-
unsigned int promiscuity;
-
unsigned int allmulti;
-
-
-
/* Protocol specific pointers */
-
-
#ifdef CONFIG_NET_DSA
-
void *dsa_ptr; /* dsa specific data */
-
#endif
-
void *atalk_ptr; /* AppleTalk link */
-
void *ip_ptr; /* IPv4 specific data */
-
void *dn_ptr; /* DECnet specific data */
-
void *ip6_ptr; /* IPv6 specific data */
-
void *ec_ptr; /* Econet specific data */
-
void *ax25_ptr; /* AX.25 specific data */
-
struct wireless_dev *ieee80211_ptr; /* IEEE 802.11 specific data,
-
assign before registering */
-
-
/*
-
* Cache line mostly used on receive path (including eth_type_trans())
-
*/
-
unsigned long last_rx; /* Time of last Rx */
-
/* Interface address info used in eth_type_trans() */
-
unsigned char *dev_addr; /* hw address, (before bcast
-
because most packets are
-
unicast) */
-
-
struct netdev_hw_addr_list dev_addrs; /* list of device
-
hw addresses */
-
-
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
-
-
struct netdev_queue rx_queue;
-
-
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
-
-
/* Number of TX queues allocated at alloc_netdev_mq() time */
-
unsigned int num_tx_queues;
-
-
/* Number of TX queues currently active in device */
-
unsigned int real_num_tx_queues;
-
-
/* root qdisc from userspace point of view */
-
struct Qdisc *qdisc;
-
-
unsigned long tx_queue_len; /* Max frames per queue allowed */
-
spinlock_t tx_global_lock;
-
/*
-
* One part is mostly used on xmit path (device)
-
*/
-
/* These may be needed for future network-power-down code. */
-
-
/*
-
* trans_start here is expensive for high speed devices on SMP,
-
* please use netdev_queue->trans_start instead.
-
*/
-
unsigned long trans_start; /* Time (in jiffies) of last Tx */
-
-
int watchdog_timeo; /* used by dev_watchdog() */
-
struct timer_list watchdog_timer;
-
-
/* Number of references to this device */
-
atomic_t refcnt ____cacheline_aligned_in_smp;
-
-
/* delayed register/unregister */
-
struct list_head todo_list;
-
/* device index hash chain */
-
struct hlist_node index_hlist;
-
-
struct net_device *link_watch_next;
-
-
/* register/unregister state machine */
-
enum { NETREG_UNINITIALIZED=0,
-
NETREG_REGISTERED, /* completed register_netdevice */
-
NETREG_UNREGISTERING, /* called unregister_netdevice */
-
NETREG_UNREGISTERED, /* completed unregister todo */
-
NETREG_RELEASED, /* called free_netdev */
-
NETREG_DUMMY, /* dummy device for NAPI poll */
-
} reg_state;
-
-
/* Called from unregister, can be used to call free_netdev */
-
void (*destructor)(struct net_device *dev);
-
-
#ifdef CONFIG_NETPOLL
-
struct netpoll_info *npinfo;
-
#endif
-
-
#ifdef CONFIG_NET_NS
-
/* Network namespace this network device is inside */
-
struct net *nd_net;
-
#endif
-
-
/* mid-layer private */
-
void *ml_priv;
-
-
/* bridge stuff */
-
struct net_bridge_port *br_port;
-
/* macvlan */
-
struct macvlan_port *macvlan_port;
-
/* GARP */
-
struct garp_port *garp_port;
-
-
/* class/net/name entry */
-
struct device dev;
-
/* space for optional statistics and wireless sysfs groups */
-
const struct attribute_group *sysfs_groups[3];
-
-
/* rtnetlink link ops */
-
const struct rtnl_link_ops *rtnl_link_ops;
-
-
/* VLAN feature mask */
-
unsigned long vlan_features;
-
-
/* for setting kernel sock attribute on TCP connection setup */
-
#define GSO_MAX_SIZE 65536
-
unsigned int gso_max_size;
-
-
#ifdef CONFIG_DCB
-
/* Data Center Bridging netlink ops */
-
struct dcbnl_rtnl_ops *dcbnl_ops;
-
#endif
-
-
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
-
/* max exchange id for FCoE LRO by ddp */
-
unsigned int fcoe_ddp_xid;
-
#endif
-
}
对于里面结构体的说明,请看《深入理解linux网络内幕》第二章,关键数据结构.
对于驱动的初始化操作这里不再说明,可以参考drivers/net下的驱动.
我们注册的设备,实际上是添加到了dev_base链表,可以通过dev_get_by_name和dev_get_by_index查询.
对于设备的状态,包括它的状态机,一直没弄明白,觉得很神秘,这里就来看看.
设备状态:
net_device中:
flags用于存储各种标志的位域。多数标志代表设备的能力。然而,其中之一的IFF_UP用于指出该设备是开启还是关闭,可以在include/linux/if.h中找到各个IFF_XXX
这里的实际例子就在dev_queue_xmit函数里:对于没有发送队列的设备(如loopback),发送前会检查 dev->flags & IFF_UP
reg_state:设备注册状态
在《深入理解Linux网络内幕》所描述的较早内核中,还有一个NETREG_REGISTERING状态,
介于NETREG_UNINITIALIZED和NETREG_REGISTERED之间,
由netdev_run_todo处理;2.6.32的枚举(见下)中已经没有这个状态。
-
/* register/unregister state machine */
-
enum { NETREG_UNINITIALIZED=0,
-
NETREG_REGISTERED, /* completed register_netdevice */
-
NETREG_UNREGISTERING, /* called unregister_netdevice */
-
NETREG_UNREGISTERED, /* completed unregister todo */
-
NETREG_RELEASED, /* called free_netdev */
-
NETREG_DUMMY, /* dummy device for NAPI poll */
-
} reg_state;
state:和队列规则有关的设备状态
-
/* These flag bits are private to the generic network queueing
-
* layer, they may not be explicitly referenced by any other
-
* code.
-
*/
-
-
enum netdev_state_t
-
{
-
__LINK_STATE_START, // 设备开启 由 netif_running检查
-
__LINK_STATE_PRESENT, //设备存在 netif_device_present 挂起到恢复继续时 需要操作
-
__LINK_STATE_NOCARRIER, // 没载波 netif_carrier_ok 检查
-
__LINK_STATE_LINKWATCH_PENDING,
-
__LINK_STATE_DORMANT,
-
};
实际操作:
-
/*
-
* Default initial state at registry is that the
-
* device is present.
-
*/
-
-
set_bit(__LINK_STATE_PRESENT, &dev->state);
队列规则状态:(个人对这一块比较感兴趣,因为涉及qos等)
每个设备都会被分配一种队列规则,流量控制以此实现其qos机制。
队列规则即qos 是在register_netdevice中初始化的
由函数dev_init_scheduler(dev);来处理.
-
void dev_init_scheduler(struct net_device *dev)
-
{
-
dev->qdisc = &noop_qdisc;
-
netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc);
-
dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc);
-
-
setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev);
-
}
它最后初始化了dev看门狗.
而关于noop_qdisc,我们在看netif_receive_skb时会看到
-
#ifdef CONFIG_NET_CLS_ACT
-
skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
-
if (!skb)
-
goto out;
-
ncls:
-
#endif
-
#ifdef CONFIG_NET_CLS_ACT
-
/* TODO: Maybe we should just force sch_ingress to be compiled in
-
* when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
-
* a compare and 2 stores extra right now if we dont have it on
-
* but have CONFIG_NET_CLS_ACT
-
* NOTE: This doesnt stop any functionality; if you dont have
-
* the ingress scheduler, you just cant add policies on ingress.
-
*
-
*/
-
static int ing_filter(struct sk_buff *skb)
-
{
-
struct net_device *dev = skb->dev;
-
u32 ttl = G_TC_RTTL(skb->tc_verd);
-
struct netdev_queue *rxq;
-
int result = TC_ACT_OK;
-
struct Qdisc *q;
-
-
if (MAX_RED_LOOP < ttl++) {
-
printk(KERN_WARNING
-
"Redir loop detected Dropping packet (%d->%d)\n",
-
skb->iif, dev->ifindex);
-
return TC_ACT_SHOT;
-
}
-
-
skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
-
skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
-
-
rxq = &dev->rx_queue;
-
-
q = rxq->qdisc;
-
if (q != &noop_qdisc) {
-
spin_lock(qdisc_lock(q));
-
if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
-
result = qdisc_enqueue_root(skb, q);
-
spin_unlock(qdisc_lock(q));
-
}
-
-
return result;
-
}
-
-
static inline struct sk_buff *handle_ing(struct sk_buff *skb,
-
struct packet_type **pt_prev,
-
int *ret, struct net_device *orig_dev)
-
{
-
if (skb->dev->rx_queue.qdisc == &noop_qdisc)
-
goto out;
-
-
if (*pt_prev) {
-
*ret = deliver_skb(skb, *pt_prev, orig_dev);
-
*pt_prev = NULL;
-
} else {
-
/* Huh? Why does turning on AF_PACKET affect this? */
-
skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
-
}
-
-
switch (ing_filter(skb)) {
-
case TC_ACT_SHOT:
-
case TC_ACT_STOLEN:
-
kfree_skb(skb);
-
return NULL;
-
}
-
-
out:
-
skb->tc_verd = 0;
-
return skb;
-
}
-
#endif
我们回到函数dev_init_scheduler,它调用
-
static void dev_init_scheduler_queue(struct net_device *dev,
-
struct netdev_queue *dev_queue,
-
void *_qdisc)
-
{
-
struct Qdisc *qdisc = _qdisc;
-
-
dev_queue->qdisc = qdisc;
-
dev_queue->qdisc_sleeping = qdisc;
-
}
对收发队列进行初始化操作. 设备注册后,在dev_open时会由dev_activate(dev);来激活队列.
而队列的最开始初始化是在net/sched/sch_api.c中
-
static int __init pktsched_init(void)
-
{
-
register_qdisc(&pfifo_qdisc_ops);
-
register_qdisc(&bfifo_qdisc_ops);
-
register_qdisc(&mq_qdisc_ops);
-
proc_net_fops_create(&init_net, "psched", 0, &psched_fops);
-
-
rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL);
-
rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL);
-
rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc);
-
rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL);
-
rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL);
-
rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass);
-
-
return 0;
-
}
-
-
subsys_initcall(pktsched_init);
这里只是一个小小的开端,准备以后仔细研究下qdisc ^^
前面我们看到一个特殊的函数 netdev_run_todo,其实这个函数在设备注册和注销的时候都会用到.
rtnl_unlock时 会调用netdev_run_todo来执行任务.
至于注册中的rtNetlink通知链这里不说!
设备引用计数 dev->refcnt:注册时由register_netdevice调用dev_hold,初始值为1 .
操作函数:dev_put 、dev_hold
很多时候,我们会需要动态的注销一些设备,这个时候在注销的时候 会发送通知让其他引用的子系统释放掉引用.
那么就需要netdev_run_todo调用netdev_wait_allrefs来等待引用被释放.它会周期性地向netdev_chain重发NETDEV_UNREGISTER通知(注意不是reg_state里的NETREG_UNREGISTERED),直到引用计数为0.
开启和关闭网络设备需要具体做哪些工作,以及它们之间的顺序?
1. 调用dev->open(2.6.32中为dev->netdev_ops->ndo_open)
2. 设置dev->state :__LINK_STATE_START
3. 设置dev->flags中IFF_UP
4. 调用dev_activate初始化由流量控制使用的出口队列规则,然后启动watchdog
5. 向通知链netdev_chain发送NETDEV_UP通知,告知其他内核组件做出反应
关闭的流程和open相反
当然这里和注册和注销所做是不同的事情。
设备的电源管理:挂起和恢复
这里涉及到netif_device_detach和netif_device_attach的作用:
挂起的时候用 netif_device_detach;
恢复的时候用 netif_device_attach。
对网络设备操作的常用工具:
ifconfig、ethtool、iproute2、mii-tool
ifconfig 工作原理?
用户空间先创建一个socket,
再对它调用ioctl ----> 内核中由dev_ioctl处理
虚拟设备
当然这篇文章很琐碎很杂,不过大部分设备初始化或者注册什么都说到了. 这里写它,只是为了使心中模糊的东西,更加清晰化.
阅读(5012) | 评论(0) | 转发(1) |