Linux内核cache初探-ninver-ChinaUnix博客

my blog

首页　| 　博文目录　| 　关于我

ninver

博客访问： 154739
博文数量： 54
博客积分： 2517
博客等级：少校
技术积分： 540
用户组：普通用户
注册时间： 2009-09-13 18:52

文章分类

全部博文（54）

未分配的博文（54）

文章存档

2011年（2）

2010年（11）

2009年（41）

我的朋友

zhiqiang

相关博文

Linux内核cache初探

分类： LINUX

2009-10-27 11:46:18

Linux内核里cache分成page cache 和buffer cache。

Page cache用作所有的页，Buffer cache用于块设备

Buffer cache由链表组成经常用到的cache 数据被放到链表头

效率和性能是对内核来说是很重要的两个方面，buffering和caching使用一部分内存确保在内存中操作经常使用的以及很重要的块设备数据

但是每次数据改变了并不马上回写到块设备在一段时间后才写到设备中去这个时间间隔取决于很多种因素比如内存性能，内存数据操作的频繁次数。一次写请求只需花费少量时间，因此延迟的写操作总的来说改善了系统的性能

尽管如此，cache在内核中应当谨慎使用

1 内存大大小于块设备的容量

2 内存用作cache ，是的应用程序可用的内存变小

3 如果系统崩溃内存里的数据没来得及写到硬盘就丢了

Caching和swap是两个相反的操作，因为高速内存被用作cache替代低速的设备上的swap

Linux内核有两种cache可供块设备使用

1 page cache专为页为单位操作的

2buffer cache 用作块

问题是如何在Linux内核里的page cache快速找到需要的页。Linux内核用的是一种叫做radix tree的树来管理page cache里的页

得益于page cache写操作不用直接操作下层的块设备只需操作内存就行了，数据随后被写到low level设备里，这样设备的性能得到很大的利用，这里我们就有一个问题了什么时候把数据回写到设备合适，当然取决于具体环境和应用

Linux内核提供了几种方法可供选择：

1 守护进程pdflush在后台运行并且被周期的激活，查看页里的cahce并把没有同步的数据写回到下层的块设备中去

2第二种办法是pdflush被内核激活如果在短时间内cache里的大量数据被更新

3 对用户和应用程序来说系统调用是个不错的选择比如：sync

为了管理cache里各种各样的数据结构内核用抽象的address space 表示具体的块设备在内存中的页

最初我们只关注一个方面，就是address space结构体里有个inode 的host 的块设备对象，在多数情况下很多inode只表示一个文件（设备文件），因为page cache源于文件访问，inode 表示虚拟的块设备文件系统，因此address space表示设备而非一个文件，也由于inode存在于设备super block 的inode的链表中，因此我们要遍历super block的inode 链表才能得到page cache。

struct address_space {

struct inode *host; /* owner: inode, block_device */

struct radix_tree_root page_tree; /* radix tree of all pages */

spinlock_t tree_lock; /* and lock protecting it */

unsigned int i_mmap_writable;/* count VM_SHARED mappings */

struct prio_tree_root i_mmap; /* tree of private and shared mappings */

struct list_head i_mmap_nonlinear;/*list VM_NONLINEAR mappings */

spinlock_t i_mmap_lock; /* protect tree, count, list */

unsigned int truncate_count; /* Cover race condition with truncate */

unsigned long nrpages; /* number of total pages */

pgoff_t writeback_index;/* writeback starts here */

const struct address_space_operations *a_ops; /* methods */

unsigned long flags; /* error bits/gfp mask */

struct backing_dev_info *backing_dev_info; /* device readahead, etc */

spinlock_t private_lock; /* for use by the address_space */

struct list_head private_list; /* ditto */

struct address_space *assoc_mapping; /* ditto */

} __attribute__((aligned(sizeof(long))));

struct super_block {

struct list_head s_list; /* Keep this first */

dev_t s_dev; /* search index; _not_ kdev_t */

unsigned long s_blocksize;

unsigned char s_blocksize_bits;

unsigned char s_dirt;

unsigned long long s_maxbytes; /* Max file size */

struct file_system_type *s_type;

const struct super_operations *s_op;

struct dquot_operations *dq_op;

struct quotactl_ops *s_qcop;

const struct export_operations *s_export_op;

unsigned long s_flags;

unsigned long s_magic;

struct dentry *s_root;

struct rw_semaphore s_umount;

struct mutex s_lock;

int s_count;

int s_need_sync_fs;

atomic_t s_active;

#ifdef CONFIG_SECURITY

void *s_security;

#endif

struct xattr_handler **s_xattr;

struct list_head s_inodes; /* all inodes */

struct list_head s_dirty; /* dirty inodes */

struct list_head s_io; /* parked for writeback */

struct list_head s_more_io; /* parked for more writeback */

struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */

struct list_head s_files;

/* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */

struct list_head s_dentry_lru; /* unused dentry lru */

int s_nr_dentry_unused; /* # of dentry on lru */

struct block_device *s_bdev;

struct mtd_info *s_mtd;

struct list_head s_instances;

struct quota_info s_dquot; /* Diskquota specific options */

int s_frozen;

wait_queue_head_t s_wait_unfrozen;

char s_id[32]; /* Informational name */

void *s_fs_info; /* Filesystem private info */

fmode_t s_mode;

* The next field is for VFS *only*. No filesystems have any business

* even looking at it. You had been warned.

struct mutex s_vfs_rename_mutex; /* Kludge */

/* Granularity of c/m/atime in ns.

Cannot be worse than a second */

u32 s_time_gran;

* Filesystem subtype. If non-empty the filesystem type field

* in /proc/mounts will be "type.subtype"

char *s_subtype;

* Saved mount options for lazy filesystems using

* generic_show_options()

char *s_options;

* storage for asynchronous operations

struct list_head s_async_list;

};

struct inode {

struct hlist_node i_hash;

struct list_head i_list;

struct list_head i_sb_list;

struct list_head i_dentry;

unsigned long i_ino;

atomic_t i_count;

unsigned int i_nlink;

uid_t i_uid;

gid_t i_gid;

dev_t i_rdev;

u64 i_version;

loff_t i_size;

#ifdef __NEED_I_SIZE_ORDERED

seqcount_t i_size_seqcount;

#endif

struct timespec i_atime;

struct timespec i_mtime;

struct timespec i_ctime;

unsigned int i_blkbits;

blkcnt_t i_blocks;

unsigned short i_bytes;

umode_t i_mode;

spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */

struct mutex i_mutex;

struct rw_semaphore i_alloc_sem;

const struct inode_operations *i_op;

const struct file_operations *i_fop; /* former ->i_op->default_file_ops */

struct super_block *i_sb;

struct file_lock *i_flock;

struct address_space *i_mapping;

struct address_space i_data;

#ifdef CONFIG_QUOTA

struct dquot *i_dquot[MAXQUOTAS];

#endif

struct list_head i_devices;

union {

struct pipe_inode_info *i_pipe;

struct block_device *i_bdev;

struct cdev *i_cdev;

};

int i_cindex;

__u32 i_generation;

#ifdef CONFIG_DNOTIFY

unsigned long i_dnotify_mask; /* Directory notify events */

struct dnotify_struct *i_dnotify; /* for directory notifications */

#endif

#ifdef CONFIG_INOTIFY

struct list_head inotify_watches; /* watches on this inode */

struct mutex inotify_mutex; /* protects the watches list */

#endif

unsigned long i_state;

unsigned long dirtied_when; /* jiffies of first dirtying */

unsigned int i_flags;

atomic_t i_writecount;

#ifdef CONFIG_SECURITY

void *i_security;

#endif

void *i_private; /* fs or device private pointer */

};

通常修改文件或其他的结构只改动页里的部分内容，同步页没有必要把整个页写回到块设备中去，为了节省时间Linux内核在写操作时把cache里的页分成更小的单位buffers

Buffer cache由两部分组成：1 buffer head 2 数据

应用程序是通过块访问块设备的而非页，就像superblock ，buffer cache操作时是独立的和page cache无关，

buffer heads — the data structure is the same in buffer caches and page caches — are grouped together

in an array of constant size whose individual entries are managed on a least recently used basis.

最多被用到的buffer_head 被放到buffer cache 数组的最前面反之很长时间没用到则被往后移动？因为这种排列所以进入LRU 链表的次数被限定为一个值在Linux内核运行时不能改动，内核不需要再开额外的线程去调整cache的大小为更合理的值。替代方法是内存释放时需要移除相关连的buffer

在内核运行时，我们不止要知道内存页中的一个buffer的cache在哪？还要知道cahce的数据是谁的？然而在早前的Linux 内核中inode只是能得到cache目录的入口数据，现在内核使用很多address space用来在cache数据和要得到的设备数据建立连接，尽管文件目录仍然占据大量的cache里数据，这些接口统一了以便caches锁定找到数据加快访问的速度

Address space刚好放入页cache的机制有两点：

1 页面在主存中已经被分配了地址空间，所以这些页面能被用户进程或内核操作

2 The backing store specifies the sources from which the address space pages are filled. Address

spaces relate to the virtual address space of the processor and are a mapping of the segment

managed by the processor in virtual memory and the corresponding positions on a source

device (using a block device).

为了支持数据传输，地址空间提供了一系列操作比如：读块设备或者文件系统中的页，修改页面等操作

地址空间是内核的核心数据结构，文件系统，块设备 cache都是以地址空间为中心的

内核使用radix_tree数据结构管理地址空间里的页

Struct Address_space中 radix_tree_root指向所有的radix tree

Page cache 被用作在radix tree 的顶部，page_cache_alloc用来分配新的页加入到页cache中去，这页就被加上__cold后缀来识别，page_cache_alloc_cold

起先，radix_tree没有被提到，因为内存是通过alloc_pages从buddy系统中来分配得到的

把页加入到缓存中的函数是add_to_page_cache，调用的函数radix_tree_insert则是把页加入到radix_tree

通常页的读取是顺利的，linux为了提高I/O的性能，尽量减少I/O请求的发送，主要是为了提高进程顺序读取文件的效率，提供了readahead的预读机制，如果linux判断一个进程在顺序读取文件，那么它会提前读取进程所需文件的数据，放在缓存中。

判断的标准是：

1.如果进程是第一次读取文件，那么检查进程是不是读取文件的第一个page

2.读取的文件page是不是上次读取page的下一个

满足上面两个条件的进程，linux会为其创建两个window，每个window包含几个page。一个被称为current window；另一个被称为ahead window。current window是进程正在读取的文件page 数据（也可能包含部分ahead数据），ahead window是预读取的文件数据。当current window里的page数据都被进程读取后，ahead window就成为了current window，linux将会申请下一个ahead window。

当进程I/O操作不满足前面的条件时，read ahead便会暂时失效，等条件再次被满足时，read ahead机制就可以重新被激活。

在linux kernel 2.6.23后内核新增了一个页面标志位:PG_readahead,它是“请作异步预读”的一个提示在每次进行新预读时,算法都会选择其中的一个新页面并标记之。预读规则相应的改为:
◆ 当读到缺失页面(missing page),进行同步预读;
◆ 当读到预读页面(PG_readahead page),进行异步预读。
这样一来,ahead预读窗口就不需要了:它实际上是把预读大小和提前量两者作了不必要的绑定。

Buffer cache

struct buffer_head {

unsigned long b_state; /* buffer state bitmap (see above) */

struct buffer_head *b_this_page;/* circular list of page's buffers */

struct page *b_page; /* the page this bh is mapped to */

sector_t b_blocknr; /* start block number */

size_t b_size; /* size of mapping */

char *b_data; /* pointer to data within the page */

struct block_device *b_bdev;

bh_end_io_t *b_end_io; /* I/O completion */

void *b_private; /* reserved for b_end_io */

struct list_head b_assoc_buffers; /* associated with another mapping */

struct address_space *b_assoc_map; /* mapping this buffer is

associated with */

atomic_t b_count; /* users using this buffer_head */

};

在buffer能被使用之前，内核必须首先建立一个buffer_Head结构，建立新的buffer_head被频繁的调用，所以尽可能快的处理，最经典的做法是用slab cache内核代码里已经有相关的函数操作了alloc_buffer_head 产生新的buffer head, 以及 free_buffer_head 用于销毁一个已存在的buffer head

Buffer heads适用于连接内存中有用的数据，页和buffer_head在内核中是通过create_empty_buffers and link_dev_buffers 两个函数联系起来的。link_dev_buffers用来连接一个已存在的buffer_head和页，create_empty_buffer当调用block_read_full_page和__block_write_full_page.函数读写页时被唤醒。

create_empty_buffers f首先调用函数 alloc_page_buffers建立需要的buffer heads。当然在buffers 和页之间建立连接如果内有内核其他部分的协助也是没有明显效果的，比如有些块设备的传输操作用到的传输单位就依赖于用到底层设备块大小

下图是块设备的读写操作：

LRU buffer：

Buffers不止应用于页面，在早前的Linux内核中，所有的buffer里缓存和页缓存是无关的，现在的内核中还有这种应用，比如在块层上访问块设备的数据而非页层，为了提高访问的速度，Linux内核提供了另外一种cache（LRU buffer）。

独立buffer里的缓存不是完全来自于页缓存，内存使用页的方式来管理的，被缓存的块也被写入页里。

为了提高查找的速度，内核每次会先去查看cahce的入口。如果找到了需要的数据，那cahce的数据就能用，如果没找到内核为了渠道数据必须提交一个到块设备底层的请求。

最近使用的数据被存到cache里最开始的位置。过段时间不用的cache数据就被丢弃。Linux内核主要用到两个函数管理LRU cache分别是lookup_bh_lru用于检查缓存的入口是否存在，bh_lru_install用于加入新的buffer head到缓存里。这两个函数是封装的因此内核是不直接调用，而是通过通用的接口函数__bread 和 __getblk实现。这两个函数的区别是bread保证返回的是最新的缓存，如果有必要就访问底层的设备。

__getblk函数的调用图：

首先调用find_get_block，lookup_bh_lru检查cache 是否在如果成功返回buffer head

Getblk_slow则是另外一条路，在getblk_slow会调grow_buffer调用grow_dev_page

Linux内核代码分析：

在内核里把一个新的页加入到cache里的调用地方有五处，第一处是do_generic_file_read在读取设备文件时

do_generic_file_read（）

{

no_cached_page:

page = page_cache_alloc_cold(mapping); //没有搜索到指定的页。则重新分配页

error = add_to_page_cache_lru(page, mapping,

index, GFP_KERNEL); //把页加入到高速缓存中去

}

第二处是：page_cache_read（）

第三处是：__read_cache_page（）

第四处是在文件预读的地方

阅读(2531) | 评论(0) | 转发(0) |

上一篇：Linux进程调度模拟3

下一篇：使用交叉编译的mtd-utils工具mtd_debug查看分区信息以及读取flash磁盘数据

给主人留下些什么吧！~~

感谢所有关心和支持过ChinaUnix的朋友们

16024965号-6