这里所说的复杂设备驱动涉及到PCI、USB、网络设备、块设备等(严格意义而言,这些设备在概念上并不并列,例如与块设备并列的是字符设备,而PCI、USB设备等都可能属于字符设备),这些设备的驱动中又涉及到一些与特定设备类型相关的较为复杂的数据结构和程序结构。本文将不对这些设备驱动的细节进行过多的介绍,仅仅进行轻描淡写的叙述。
PCI 是The Peripheral Component Interconnect –Bus的缩写,CPU使用PCI桥chipset与PCI设备通信,PCI桥chipset处理了PCI子系统与内存子系统间的所有数据交互,PCI设备完全被从内存子系统分离出来。下图呈现了PCI子系统的原理:
每个PCI设备都有一个256字节的设备配置块,其中前64字节作为设备的ID和基本配置信息,Linux中提供了一组函数来处理PCI配置块。在PCI设备能得以使用前,Linux驱动程序需要从PCI设备配置块中的信息决定设备的特定参数,进行相关设置以便能正确操作该PCI设备。
一般的PCI设备初始化函数处理流程为:
(1)检查内核是否支持PCI-Bios;
(2)检查设备是否存在,获得设备的配置信息;
1~2这两步的例子如下:
int pcidata_read_proc(char *buf, char **start, off_t offset, int len, int *eof,
void *data)
{
int i, pos = 0;
int bus, devfn;
if (!pcibios_present())
return sprintf(buf, "No PCI bios present\n");
/*
* This code is derived from "drivers/pci/pci.c". This means that
* the GPL applies to this source file and credit is due to the
* original authors (Drew Eckhardt, Frederic Potter, David
* Mosberger-Tang)
*/
for (bus = 0; !bus; bus++)
{
/* only bus 0 :-) */
for (devfn = 0; devfn < 0x100 && pos < PAGE_SIZE / 2; devfn++)
{
struct pci_dev *dev = NULL;
dev = pci_find_slot(bus, devfn);
if (!dev)
continue;
/* Ok, we've found a device, copy its cfg space to the buffer*/
for (i = 0; i < 256; i += sizeof(u32), pos += sizeof(u32))
pci_read_config_dword(dev, i, (u32*)(buf + pos));
pci_release_device(dev); /* 2.0 compatibility */
}
}
*eof = 1;
return pos;
}
其中使用的pci_find_slot()函数定义为:
struct pci_dev *pci_find_slot (unsigned int bus,
unsigned int devfn)
{
struct pci_dev *pptr = kmalloc(sizeof(*pptr), GFP_KERNEL);
int index = 0;
unsigned short vendor;
int ret;
if (!pptr) return NULL;
pptr->index = index; /* 0 */
ret = pcibios_read_config_word(bus, devfn, PCI_VENDOR_ID, &vendor);
if (ret /* == PCIBIOS_DEVICE_NOT_FOUND or whatever error */
|| vendor==0xffff || vendor==0x0000) {
kfree(pptr); return NULL;
}
printk("ok (%i, %i %x)\n", bus, devfn, vendor);
/* fill other fields */
pptr->bus = bus;
pptr->devfn = devfn;
pcibios_read_config_word(pptr->bus, pptr->devfn,
PCI_VENDOR_ID, &pptr->vendor);
pcibios_read_config_word(pptr->bus, pptr->devfn,
PCI_DEVICE_ID, &pptr->device);
return pptr;
}
(3)根据设备的配置信息申请I/O空间及IRQ资源;
(4)注册设备。
USB设备的驱动主要处理probe(探测)、disconnect(断开)函数及usb_device_id(设备信息)数据结构,如:
static struct usb_device_id sample_id_table[] =
{
{
USB_INTERFACE_INFO(3, 1, 1), driver_info: (unsigned long)"keyboard"
} ,
{
USB_INTERFACE_INFO(3, 1, 2), driver_info: (unsigned long)"mouse"
}
,
{
0, /* no more matches */
}
};
static struct usb_driver sample_usb_driver =
{
name: "sample", probe: sample_probe, disconnect: sample_disconnect, id_table:
sample_id_table,
};
当一个USB 设备从系统拔掉后,设备驱动程序的disconnect 函数会自动被调用,在执行了disconnect 函数后,所有为USB 设备分配的数据结构,内存空间都会被释放:
static void sample_disconnect(struct usb_device *udev, void *clientdata)
{
/* the clientdata is the sample_device we passed originally */
struct sample_device *sample = clientdata;
/* remove the URB, remove the input device, free memory */
usb_unlink_urb(&sample->urb);
kfree(sample);
printk(KERN_INFO "sample: USB %s disconnected\n", sample->name);
/*
* here you might MOD_DEC_USE_COUNT, but only if you increment
* the count in sample_probe() below
*/
return;
}
当驱动程序向子系统注册后,插入一个新的USB设备后总是要自动进入probe函数。驱动程序会为这个新加入系统的设备向内部的数据结构建立一个新的实例。通常情况下,probe 函数执行一些功能来检测新加入的USB 设备硬件中的生产厂商和产品定义以及设备所属的类或子类定义是否与驱动程序相符,若相符,再比较接口的数目与本驱动程序支持设备的接口数目是否相符。一般在probe 函数中也会解析USB 设备的说明,从而确认新加入的USB 设备会使用这个驱动程序:
static void *sample_probe(struct usb_device *udev, unsigned int ifnum,
const struct usb_device_id *id)
{
/*
* The probe procedure is pretty standard. Device matching has already
* been performed based on the id_table structure (defined later)
*/
struct usb_interface *iface;
struct usb_interface_descriptor *interface;
struct usb_endpoint_descriptor *endpoint;
struct sample_device *sample;
printk(KERN_INFO "usbsample: probe called for %s device\n",
(char *)id->driver_info /* "mouse" or "keyboard" */ );
iface = &udev->actconfig->interface[ifnum];
interface = &iface->altsetting[iface->act_altsetting];
if (interface->bNumEndpoints != 1) return NULL;
endpoint = interface->endpoint + 0;
if (!(endpoint->bEndpointAddress & 0x80)) return NULL;
if ((endpoint->bmAttributes & 3) != 3) return NULL;
usb_set_protocol(udev, interface->bInterfaceNumber, 0);
usb_set_idle(udev, interface->bInterfaceNumber, 0, 0);
/* allocate and zero a new data structure for the new device */
sample = kmalloc(sizeof(struct sample_device), GFP_KERNEL);
if (!sample) return NULL; /* failure */
memset(sample, 0, sizeof(*sample));
sample->name = (char *)id->driver_info;
/* fill the URB data structure using the FILL_INT_URB macro */
{
int pipe = usb_rcvintpipe(udev, endpoint->bEndpointAddress);
int maxp = usb_maxpacket(udev, pipe, usb_pipeout(pipe));
if (maxp > 8) maxp = 8; sample->maxp = maxp; /* remember for later */
FILL_INT_URB(&sample->urb, udev, pipe, sample->data, maxp,
sample_irq, sample, endpoint->bInterval);
}
/* register the URB within the USB subsystem */
if (usb_submit_urb(&sample->urb)) {
kfree(sample);
return NULL;
}
/* announce yourself */
printk(KERN_INFO "usbsample: probe successful for %s (maxp is %i)\n",
sample->name, sample->maxp);
/*
* here you might MOD_INC_USE_COUNT; if you do, you'll need to unplug
* the device or the devices before being able to unload the module
*/
/* and return the new structure */
return sample;
}
在网络设备驱动的编写中,我们特别关心的就是数据的收、发及中断。网络设备驱动程序的层次如下:
网络设备接收到报文后将其传入上层:
/*
* Receive a packet: retrieve, encapsulate and pass over to upper levels
*/
void snull_rx(struct net_device *dev, int len, unsigned char *buf)
{
struct sk_buff *skb;
struct snull_priv *priv = (struct snull_priv *) dev->priv;
/*
* The packet has been retrieved from the transmission
* medium. Build an skb around it, so upper layers can handle it
*/
skb = dev_alloc_skb(len+2);
if (!skb) {
printk("snull rx: low on mem - packet dropped\n");
priv->stats.rx_dropped++;
return;
}
skb_reserve(skb, 2); /* align IP on 16B boundary */
memcpy(skb_put(skb, len), buf, len);
/* Write metadata, and then pass to the receive level */
skb->dev = dev;
skb->protocol = eth_type_trans(skb, dev);
skb->ip_summed = CHECKSUM_UNNECESSARY; /* don't check it */
priv->stats.rx_packets++;
#ifndef LINUX_20
priv->stats.rx_bytes += len;
#endif
netif_rx(skb);
return;
}
在中断到来时接收报文信息:
void snull_interrupt(int irq, void *dev_id, struct pt_regs *regs)
{
int statusword;
struct snull_priv *priv;
/*
* As usual, check the "device" pointer for shared handlers.
* Then assign "struct device *dev"
*/
struct net_device *dev = (struct net_device *)dev_id;
/* ... and check with hw if it's really ours */
if (!dev /*paranoid*/ ) return;
/* Lock the device */
priv = (struct snull_priv *) dev->priv;
spin_lock(&priv->lock);
/* retrieve statusword: real netdevices use I/O instructions */
statusword = priv->status;
if (statusword & SNULL_RX_INTR) {
/* send it to snull_rx for handling */
snull_rx(dev, priv->rx_packetlen, priv->rx_packetdata);
}
if (statusword & SNULL_TX_INTR) {
/* a transmission is over: free the skb */
priv->stats.tx_packets++;
priv->stats.tx_bytes += priv->tx_packetlen;
dev_kfree_skb(priv->skb);
}
/* Unlock the device and we are done */
spin_unlock(&priv->lock);
return;
}
而发送报文则分为两个层次,一个层次是内核调用,一个层次完成真正的硬件上的发送:
/*
* Transmit a packet (called by the kernel)
*/
int snull_tx(struct sk_buff *skb, struct net_device *dev)
{
int len;
char *data;
struct snull_priv *priv = (struct snull_priv *) dev->priv;
#ifndef LINUX_24
if (dev->tbusy || skb == NULL) {
PDEBUG("tint for %p, tbusy %ld, skb %p\n", dev, dev->tbusy, skb);
snull_tx_timeout (dev);
if (skb == NULL)
return 0;
}
#endif
len = skb->len < ETH_ZLEN ? ETH_ZLEN : skb->len;
data = skb->data;
dev->trans_start = jiffies; /* save the timestamp */
/* Remember the skb, so we can free it at interrupt time */
priv->skb = skb;
/* actual deliver of data is device-specific, and not shown here */
snull_hw_tx(data, len, dev);
return 0; /* Our simple device can not fail */
}
/*
* Transmit a packet (low level interface)
*/
void snull_hw_tx(char *buf, int len, struct net_device *dev)
{
/*
* This function deals with hw details. This interface loops
* back the packet to the other snull interface (if any).
* In other words, this function implements the snull behaviour,
* while all other procedures are rather device-independent
*/
struct iphdr *ih;
struct net_device *dest;
struct snull_priv *priv;
u32 *saddr, *daddr;
/* I am paranoid. Ain't I? */
if (len < sizeof(struct ethhdr) + sizeof(struct iphdr)) {
printk("snull: Hmm... packet too short (%i octets)\n",
len);
return;
}
if (0) { /* enable this conditional to look at the data */
int i;
PDEBUG("len is %i\n" KERN_DEBUG "data:",len);
for (i=14 ; i printk(" %02x",buf[i]&0xff);
printk("\n");
}
/*
* Ethhdr is 14 bytes, but the kernel arranges for iphdr
* to be aligned (i.e., ethhdr is unaligned)
*/
ih = (struct iphdr *)(buf+sizeof(struct ethhdr));
saddr = &ih->saddr;
daddr = &ih->daddr;
((u8 *)saddr)[2] ^= 1; /* change the third octet (class C) */
((u8 *)daddr)[2] ^= 1;
ih->check = 0; /* and rebuild the checksum (ip needs it) */
ih->check = ip_fast_csum((unsigned char *)ih,ih->ihl);
if (dev == snull_devs)
PDEBUGG("%08x:%05i --> %08x:%05i\n",
ntohl(ih->saddr),ntohs(((struct tcphdr *)(ih+1))->source),
ntohl(ih->daddr),ntohs(((struct tcphdr *)(ih+1))->dest));
else
PDEBUGG("%08x:%05i <-- %08x:%05i\n",
ntohl(ih->daddr),ntohs(((struct tcphdr *)(ih+1))->dest),
ntohl(ih->saddr),ntohs(((struct tcphdr *)(ih+1))->source));
/*
* Ok, now the packet is ready for transmission: first simulate a
* receive interrupt on the twin device, then a
* transmission-done on the transmitting device
*/
dest = snull_devs + (dev==snull_devs ? 1 : 0);
priv = (struct snull_priv *) dest->priv;
priv->status = SNULL_RX_INTR;
priv->rx_packetlen = len;
priv->rx_packetdata = buf;
snull_interrupt(0, dest, NULL);
priv = (struct snull_priv *) dev->priv;
priv->status = SNULL_TX_INTR;
priv->tx_packetlen = len;
priv->tx_packetdata = buf;
if (lockup && ((priv->stats.tx_packets + 1) % lockup) == 0) {
/* Simulate a dropped transmit interrupt */
netif_stop_queue(dev);
PDEBUG("Simulate lockup at %ld, txp %ld\n", jiffies,
(unsigned long) priv->stats.tx_packets);
}
else
snull_interrupt(0, dev, NULL);
}
块设备也以与字符设备register_chrdev、unregister_ chrdev 函数类似的方法进行设备的注册与释放。但是,register_chrdev使用一个向 file_operations 结构的指针,而register_blkdev 则使用 block_device_operations 结构的指针,其中定义的open、release 和 ioctl 方法和字符设备的对应方法相同,但未定义 read 或者 write 操作。这是因为,所有涉及到块设备的 I/O 通常由系统进行缓冲处理。
块驱动程序最终必须提供完成实际块 I/O 操作的机制,在 Linux中,用于这些 I/O 操作的方法称为“request(请求)”。在块设备的注册过程中,需要初始化request队列,这一动作通过blk_init_queue来完成,blk_init_queue函数建立队列,并将该驱动程序的 request 函数关联到队列。在模块的清除阶段,应调用 blk_cleanup_queue 函数。看看mtdblock的例子:
static void handle_mtdblock_request(void)
{
struct request *req;
struct mtdblk_dev *mtdblk;
unsigned int res;
for (;;) {
INIT_REQUEST;
req = CURRENT;
spin_unlock_irq(QUEUE_LOCK(QUEUE));
mtdblk = mtdblks[minor(req->rq_dev)];
res = 0;
if (minor(req->rq_dev) >= MAX_MTD_DEVICES)
panic("%s : minor out of bound", __FUNCTION__);
if (!IS_REQ_CMD(req))
goto end_req;
if ((req->sector + req->current_nr_sectors) > (mtdblk->mtd->size >> 9))
goto end_req;
// Handle the request
switch (rq_data_dir(req))
{
int err;
case READ:
down(&mtdblk->cache_sem);
err = do_cached_read (mtdblk, req->sector << 9,
req->current_nr_sectors << 9,
req->buffer);
up(&mtdblk->cache_sem);
if (!err)
res = 1;
break;
case WRITE:
// Read only device
if ( !(mtdblk->mtd->flags & MTD_WRITEABLE) )
break;
// Do the write
down(&mtdblk->cache_sem);
err = do_cached_write (mtdblk, req->sector << 9,
req->current_nr_sectors << 9,
req->buffer);
up(&mtdblk->cache_sem);
if (!err)
res = 1;
break;
}
end_req:
spin_lock_irq(QUEUE_LOCK(QUEUE));
end_request(res);
}
}
int __init init_mtdblock(void)
{
int i;
spin_lock_init(&mtdblks_lock);
/* this lock is used just in kernels >= 2.5.x */
spin_lock_init(&mtdblock_lock);
#ifdef CONFIG_DEVFS_FS
if (devfs_register_blkdev(MTD_BLOCK_MAJOR, DEVICE_NAME, &mtd_fops))
{
printk(KERN_NOTICE "Can't allocate major number %d for Memory Technology Devices.\n",
MTD_BLOCK_MAJOR);
return -EAGAIN;
}
devfs_dir_handle = devfs_mk_dir(NULL, DEVICE_NAME, NULL);
register_mtd_user(¬ifier);
#else
if (register_blkdev(MAJOR_NR,DEVICE_NAME,&mtd_fops)) {
printk(KERN_NOTICE "Can't allocate major number %d for Memory Technology Devices.\n",
MTD_BLOCK_MAJOR);
return -EAGAIN;
}
#endif
/* We fill it in at open() time. */
for (i=0; i< MAX_MTD_DEVICES; i++) {
mtd_sizes[i] = 0;
mtd_blksizes[i] = BLOCK_SIZE;
}
init_waitqueue_head(&thr_wq);
/* Allow the block size to default to BLOCK_SIZE. */
blksize_size[MAJOR_NR] = mtd_blksizes;
blk_size[MAJOR_NR] = mtd_sizes;
BLK_INIT_QUEUE(BLK_DEFAULT_QUEUE(MAJOR_NR), &mtdblock_request, &mtdblock_lock);
kernel_thread (mtdblock_thread, NULL, CLONE_FS|CLONE_FILES|CLONE_SIGHAND);
return 0;
}
static void __exit cleanup_mtdblock(void)
{
leaving = 1;
wake_up(&thr_wq);
down(&thread_sem);
#ifdef CONFIG_DEVFS_FS
unregister_mtd_user(¬ifier);
devfs_unregister(devfs_dir_handle);
devfs_unregister_blkdev(MTD_BLOCK_MAJOR, DEVICE_NAME);
#else
unregister_blkdev(MAJOR_NR,DEVICE_NAME);
#endif
blk_cleanup_queue(BLK_DEFAULT_QUEUE(MAJOR_NR));
blksize_size[MAJOR_NR] = NULL;
blk_size[MAJOR_NR] = NULL;
}