分类:
2012-11-28 09:08:08
原文地址:深入理解Linux网络技术内幕-PCI层和网络接口卡 作者:visualfan
内核中的PCI子系统提供各种PCI设备驱动程序共同需要的所有通用功能。这个子系统减少了程序员需要为每种设备单独完成的工作,让驱动程序的编写更加简单,也让内核更容易收集和维护各种设备的信息。
数据结构
PCI层使用的一些关键数据结构类型,一般定义在include/linux/mod_devicetable.h和include/linux/pci.h中:
pci_device_id:设备标识符,这是根据PCI标准所定义的ID;
/*
 * PCI device identifier, built from the IDs defined by the PCI standard.
 * The vendor/device and subsystem fields may hold PCI_ANY_ID.
 */
struct pci_device_id {
	__u32 vendor;			/* Vendor ID or PCI_ANY_ID */
	__u32 device;			/* Device ID or PCI_ANY_ID */
	__u32 subvendor;		/* Subsystem vendor ID or PCI_ANY_ID */
	__u32 subdevice;		/* Subsystem device ID or PCI_ANY_ID */
	__u32 class;			/* (class,subclass,prog-if) triplet */
	__u32 class_mask;		/* Mask paired with the class triplet */
	kernel_ulong_t driver_data;	/* Data private to the driver */
};
pci_dev:每个PCI设备都会被分派一个pci_dev实例,像网络设备都会被分派net_device实例一样。这个结构由内核使用,以引用一个PCI设备。
/*
 * The pci_dev structure is used to describe PCI devices: one instance is
 * assigned to every PCI device and the kernel refers to the device through it.
 */
struct pci_dev {
	struct list_head bus_list;	/* node in per-bus list */
	struct pci_bus *bus;		/* bus this device is on */
	struct pci_bus *subordinate;	/* bus this device bridges to */

	void *sysdata;			/* hook for sys-specific extension */
	struct proc_dir_entry *procent;	/* device entry in /proc/bus/pci */
	struct pci_slot *slot;		/* Physical slot this device is in */

	unsigned int devfn;		/* encoded device & function index */
	unsigned short vendor;
	unsigned short device;
	unsigned short subsystem_vendor;
	unsigned short subsystem_device;
	unsigned int class;		/* 3 bytes: (base,sub,prog-if) */
	u8 revision;			/* PCI revision, low byte of class word */
	u8 hdr_type;			/* PCI header type (`multi' flag masked out) */
	u8 pcie_cap;			/* PCI-E capability offset */
	u8 pcie_type;			/* PCI-E device/port type */
	u8 rom_base_reg;		/* which config register controls the ROM */
	u8 pin;				/* which interrupt pin this device uses */

	struct pci_driver *driver;	/* which driver has allocated this device */

	/*
	 * Mask of the bits of bus address this device implements.
	 * Normally this is 0xffffffff. You only need to change this if
	 * your device has broken DMA or supports 64-bit transfers.
	 */
	u64 dma_mask;

	struct device_dma_parameters dma_parms;

	/*
	 * Current operating state. In ACPI-speak, this is D0-D3,
	 * D0 being fully functional, and D3 being off.
	 */
	pci_power_t current_state;

	/* PM capability offset in the configuration space */
	int pm_cap;

	/* Bitmask of states from which PME# can be generated */
	unsigned int pme_support:5;
	unsigned int pme_interrupt:1;
	unsigned int d1_support:1;	/* Low power state D1 is supported */
	unsigned int d2_support:1;	/* Low power state D2 is supported */
	unsigned int no_d1d2:1;		/* Only allow D0 and D3 */
	/* disallow turning off io/mem decoding during bar sizing */
	unsigned int mmio_always_on:1;
	unsigned int wakeup_prepared:1;
	unsigned int d3_delay;		/* D3->D0 transition time in ms */

#ifdef CONFIG_PCIEASPM
	struct pcie_link_state *link_state;	/* ASPM link state. */
#endif

	pci_channel_state_t error_state;	/* current connectivity state */
	struct device dev;		/* Generic device interface */

	int cfg_size;			/* Size of configuration space */

	/*
	 * Instead of touching interrupt line and base address registers
	 * directly, use the values stored here. They might be different!
	 */
	unsigned int irq;
	/* I/O and memory regions + expansion ROMs */
	struct resource resource[DEVICE_COUNT_RESOURCE];
	/* FW-assigned addr */
	resource_size_t fw_addr[DEVICE_COUNT_RESOURCE];

	/* These fields are used by common fixups */
	unsigned int transparent:1;	/* Transparent PCI bridge */
	unsigned int multifunction:1;	/* Part of multi-function device */

	/* keep track of device state */
	unsigned int is_added:1;
	unsigned int is_busmaster:1;	/* device is busmaster */
	unsigned int no_msi:1;		/* device may not use msi */
	/* userspace config space access is blocked */
	unsigned int block_ucfg_access:1;
	/* Device generates false positive parity */
	unsigned int broken_parity_status:1;
	/* device needs IRQ rerouting variant */
	unsigned int irq_reroute_variant:2;
	unsigned int msi_enabled:1;
	unsigned int msix_enabled:1;
	unsigned int ari_enabled:1;	/* ARI forwarding */
	unsigned int is_managed:1;
	/* Obsolete. Will be removed. Use pci_is_pcie() instead */
	unsigned int is_pcie:1;
	unsigned int needs_freset:1;	/* Dev requires fundamental reset */
	unsigned int state_saved:1;
	unsigned int is_physfn:1;
	unsigned int is_virtfn:1;
	unsigned int reset_fn:1;
	unsigned int is_hotplug_bridge:1;
	unsigned int __aer_firmware_first_valid:1;
	unsigned int __aer_firmware_first:1;

	pci_dev_flags_t dev_flags;
	atomic_t enable_cnt;		/* pci_enable_device has been called */

	u32 saved_config_space[16];	/* config space saved at suspend time */
	struct hlist_head saved_cap_space;

	/* attribute descriptor for sysfs ROM entry */
	struct bin_attribute *rom_attr;
	/* has display of the rom attribute been enabled? */
	int rom_attr_enabled;
	/* sysfs file for resources */
	struct bin_attribute *res_attr[DEVICE_COUNT_RESOURCE];
	/* sysfs file for WC mapping of resources */
	struct bin_attribute *res_attr_wc[DEVICE_COUNT_RESOURCE];

#ifdef CONFIG_PCI_MSI
	struct list_head msi_list;
#endif

	struct pci_vpd *vpd;

#ifdef CONFIG_PCI_IOV
	union {
		struct pci_sriov *sriov;	/* SR-IOV capability related */
		struct pci_dev *physfn;		/* the PF this VF is associated with */
	};
	struct pci_ats *ats;		/* Address Translation Service */
#endif
};
pci_driver:定义PCI层和设备驱动程序之间的接口,这个结构主要由函数指针组成,所有PCI设备都会使用这个结构。
PCI设备驱动程序都由一个pci_driver结构的实例定义,函数指针由设备驱动程序初始化为该驱动程序内适当的函数。
/*
 * Interface between the PCI layer and a device driver. It consists mostly
 * of function pointers, which the driver initializes to its own functions.
 */
struct pci_driver {
	struct list_head node;

	/* Driver name. */
	const char *name;

	/*
	 * Device IDs the kernel associates with this driver, i.e. the PCI
	 * devices this driver can drive.
	 * Must be non-NULL for probe to be called.
	 */
	const struct pci_device_id *id_table;

	/*
	 * New device inserted. Called when the PCI layer finds a device whose
	 * ID matches this driver's id_table. In a NIC driver, for example,
	 * this function enables the hardware, allocates the net_device
	 * structure, initializes and registers the new device, and may also
	 * allocate all the buffers and data structures it needs.
	 */
	int (*probe)(struct pci_dev *dev, const struct pci_device_id *id);

	/*
	 * Device removed (NULL if not a hot-plug capable driver). Called by
	 * the PCI layer when the driver is removed from the kernel or when a
	 * hot-pluggable device is removed. It is the counterpart of probe and
	 * cleans up the environment and resources. Network devices use it to
	 * release the allocated I/O ports and I/O memory, unregister the
	 * network device, and free the net_device structure along with the
	 * resources allocated by probe.
	 */
	void (*remove)(struct pci_dev *dev);

	/* Called when the system is suspended and resumed, respectively. */
	int (*suspend)(struct pci_dev *dev, pm_message_t state);	/* Device suspended */
	int (*suspend_late)(struct pci_dev *dev, pm_message_t state);
	int (*resume_early)(struct pci_dev *dev);
	int (*resume)(struct pci_dev *dev);	/* Device woken up */
	void (*shutdown)(struct pci_dev *dev);

	/* Error handling callbacks. */
	struct pci_error_handlers *err_handler;

	/* Device driver. */
	struct device_driver driver;

	/* Dynamic IDs. */
	struct pci_dynids dynids;
};
PCI NIC设备驱动程序的注册
PCI设备被独一无二地识别是通过pci_device_id这个结构中厂商ID、设备ID和设备类型等一系列参数的组合。
/*
 * Identifies a PCI device through a combination of vendor ID, device ID,
 * subsystem IDs and device class. The ID fields may hold PCI_ANY_ID.
 */
struct pci_device_id {
	__u32 vendor;			/* Vendor ID or PCI_ANY_ID */
	__u32 device;			/* Device ID or PCI_ANY_ID */
	__u32 subvendor;		/* Subsystem vendor ID or PCI_ANY_ID */
	__u32 subdevice;		/* Subsystem device ID or PCI_ANY_ID */
	__u32 class;			/* (class,subclass,prog-if) triplet */
	__u32 class_mask;		/* Mask paired with the class triplet */
	kernel_ulong_t driver_data;	/* Data private to the driver */
};
其中通过vendor和device通常就能识别设备。subvendor和subdevice很少用到,这两个字段通常设置为通配符(PCI_ANY_ID)。class和class_mask代表该设备所属的类,其中NETWORK就是NIC设备所属的类。driver_data是驱动程序使用的私有参数。
每个PCI设备驱动程序都需要将一个pci_device_id实例向内核注册。
注册函数:pci_register_driver
注销函数:pci_unregister_driver
这两个函数的参数都是一个指向pci_driver实例的指针。通过这个实例中的id_table参数(pci_device_id实例数组),内核就知道此驱动程序可以驱动哪些PCI设备。通过这个实例中的函数指针,内核可以和设备进行交互。
PCI的一个优点是:它以相当优雅的方式支持探测(probing),用于查找IRQ以及每个设备所需的其他资源。模块可以在加载期间接收一些输入参数,以告知该如何配置其所负责的所有设备。/sys文件系统输出有关系统总线如PCI、USB等的信息,包括各种设备和设备之间的关系。/sys也允许管理员为特定的设备驱动程序定义新的ID,使得除了驱动程序通过pci_driver结构的id_table注册的静态ID外,内核还能使用用户所配置的参数。有两种根据设备ID查询驱动程序的探测机制:
静态:给定设备的PCI ID,内核根据各驱动程序通过id_table注册的静态ID查找匹配的驱动程序;
动态:根据用户通过/sys手动配置的ID进行查找。
电源管理和网卡唤醒
PCI电源管理由pci_driver结构中的suspend和resume函数处理,除了分别负责PCI状态的保存和恢复之外,在处理NIC驱动时还需完成一些特定功能。suspend需要停止设备出口队列,使该设备不能再发送数据包;resume恢复出口队列,使设备可以重新传输。
网卡唤醒(Wake-on-LAN,WOL)是指NIC在接收到一种特殊类型的帧时,唤醒处于待命模式的系统的功能。WOL功能默认是关闭的,此功能可以通过pci_enable_wake打开或关闭。
/*
 * Turn a PCI device's wake-up capability on or off.
 *
 * Thin wrapper: forwards dev, state and enable to __pci_enable_wake() with
 * that function's third argument fixed to false, and returns its result.
 */
static inline int pci_enable_wake(struct pci_dev *dev, pci_power_t state,
				  bool enable)
{
	int ret = __pci_enable_wake(dev, state, false, enable);

	return ret;
}
有多种类型的帧都可以完成唤醒功能,其中ethtool功能允许管理员配置哪几种帧可以唤醒系统。net-utils套件中有一个ether-wake命令可以产生WOL Ethernet帧。
PCI NIC驱动程序注册范例
以Intel PRO/100 Ethernet驱动程序说明NIC设备驱动程序的注册,源文件为drivers/net/e100.c。
初始化pci_device_id内容:
/*
 * Build one struct pci_device_id initializer for an Intel 8255x Ethernet
 * device: Intel vendor ID, the given device ID, PCI_ANY_ID for both
 * subsystem IDs, the Ethernet network class (with mask 0xFFFF00), and
 * `ich` stored in driver_data.
 */
#define INTEL_8255X_ETHERNET_DEVICE(device_id, ich) {		\
	.vendor		= PCI_VENDOR_ID_INTEL,			\
	.device		= device_id,				\
	.subvendor	= PCI_ANY_ID,				\
	.subdevice	= PCI_ANY_ID,				\
	.class		= PCI_CLASS_NETWORK_ETHERNET << 8,	\
	.class_mask	= 0xFFFF00,				\
	.driver_data	= ich }
/****************************************************************************************/
/*
 * Declare a constant array of struct pci_device_id named `table_name`,
 * annotated with __devinitconst. Follow the macro with `= { ... };` to
 * supply the entries.
 */
#define DEFINE_PCI_DEVICE_TABLE(table_name) \
	const struct pci_device_id table_name[] __devinitconst
/***************************************************************************************/
/*
 * PCI IDs handled by the e100 driver. Each entry pairs an Intel device ID
 * with a per-device value that INTEL_8255X_ETHERNET_DEVICE() stores as the
 * entry's last (driver-private) field. The table ends with an all-zero entry.
 */
static DEFINE_PCI_DEVICE_TABLE(e100_id_table) = {
	INTEL_8255X_ETHERNET_DEVICE(0x1029, 0),
	INTEL_8255X_ETHERNET_DEVICE(0x1030, 0),
	INTEL_8255X_ETHERNET_DEVICE(0x1031, 3),
	INTEL_8255X_ETHERNET_DEVICE(0x1032, 3),
	INTEL_8255X_ETHERNET_DEVICE(0x1033, 3),
	INTEL_8255X_ETHERNET_DEVICE(0x1034, 3),
	INTEL_8255X_ETHERNET_DEVICE(0x1038, 3),
	INTEL_8255X_ETHERNET_DEVICE(0x1039, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x103A, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x103B, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x103C, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x103D, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x103E, 4),
	INTEL_8255X_ETHERNET_DEVICE(0x1050, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1051, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1052, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1053, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1054, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1055, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1056, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1057, 5),
	INTEL_8255X_ETHERNET_DEVICE(0x1059, 0),
	INTEL_8255X_ETHERNET_DEVICE(0x1064, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1065, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1066, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1067, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1068, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1069, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x106A, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x106B, 6),
	INTEL_8255X_ETHERNET_DEVICE(0x1091, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x1092, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x1093, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x1094, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x1095, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x10fe, 7),
	INTEL_8255X_ETHERNET_DEVICE(0x1209, 0),
	INTEL_8255X_ETHERNET_DEVICE(0x1229, 0),
	INTEL_8255X_ETHERNET_DEVICE(0x2449, 2),
	INTEL_8255X_ETHERNET_DEVICE(0x2459, 2),
	INTEL_8255X_ETHERNET_DEVICE(0x245D, 2),
	INTEL_8255X_ETHERNET_DEVICE(0x27DC, 7),
	{ 0, }
};
在模块的初始化和卸载接口中完成PCI设备驱动程序的注册和注销:
/*
 * PCI driver descriptor for e100: the driver name, the table of supported
 * device IDs, and the callbacks handed to the PCI layer.
 */
static struct pci_driver e100_driver = {
	.name		= DRV_NAME,
	.id_table	= e100_id_table,
	.probe		= e100_probe,
	.remove		= __devexit_p(e100_remove),
	.shutdown	= e100_shutdown,
	.err_handler	= &e100_err_handler,
#ifdef CONFIG_PM
	/* Power Management hooks */
	.suspend	= e100_suspend,
	.resume		= e100_resume,
#endif
};
/*
 * Module initialization: print the driver description, version and
 * copyright when the NETIF_MSG_DRV bit is set in the mask derived from
 * `debug`, then register e100_driver with the PCI layer.
 *
 * Returns the result of pci_register_driver().
 */
static int __init e100_init_module(void)
{
	const int msg_mask = (1 << debug) - 1;

	if (msg_mask & NETIF_MSG_DRV) {
		pr_info("%s, %s\n", DRV_DESCRIPTION, DRV_VERSION);
		pr_info("%s\n", DRV_COPYRIGHT);
	}

	return pci_register_driver(&e100_driver);
}
/* Module cleanup: unregister e100_driver from the PCI layer. */
static void __exit e100_cleanup_module(void)
{
	pci_unregister_driver(&e100_driver);
}
/* Hook e100_init_module and e100_cleanup_module up as the module's init and exit routines. */
module_init(e100_init_module);
module_exit(e100_cleanup_module);
其中的一些函数指针原型:
/* Driver name string; assigned to the .name field of e100_driver. */
#define DRV_NAME "e100"
/*
 * Probe callback installed as e100_driver.probe.
 *
 * Only the first step is shown in this excerpt: allocating the net_device
 * via alloc_etherdev(), passing sizeof(struct nic). The remainder of the
 * body is elided by the author.
 *
 * Returns -ENOMEM if the allocation fails; other return paths are not
 * visible here.
 */
static int __devinit e100_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
{
struct net_device *netdev;
struct nic *nic;
int err;
/* Allocate the net_device; bail out if the allocation fails. */
if (!(netdev = alloc_etherdev(sizeof(struct nic)))) {
/* Log the failure only when NETIF_MSG_PROBE is set in the debug-derived mask. */
if (((1 << debug) - 1) & NETIF_MSG_PROBE)
pr_err("Etherdev alloc failed, aborting\n");
return -ENOMEM;
}
……
……
}
PCI子系统总览
(a)在系统引导时,会建立一个数据库,把每个总线都关联到一份已侦测到的、使用该总线的设备列表。PCI总线的描述符除了其他参数外,还包括一个已侦测PCI设备的列表。
(b)当驱动程序被加载,调用pci_register_driver注册pci_driver到PCI层时,PCI会使用pci_driver结构中的PCI设备ID参数id_table与已侦测到的PCI设备列表匹配,若匹配到就会建立该驱动程序的设备列表。对于每个匹配到的设备,PCI层会调用相匹配的驱动程序中的pci_driver结构中的probe函数,建立并注册相关联的网络设备。
/proc/pci文件包含了已注册的PCI设备的信息。pciutils套件中的lspci命令会输出有关本地PCI设备的信息,其中有些信息取自/sys。