分类: LINUX
2010-12-30 20:31:26
见/mm/filemap.c
/*
* Write to a file through the page cache.
*
* We currently put everything into the page cache prior to writing it.
* This is not a problem when writing full pages. With partial pages,
* however, we first have to read the data into the cache, then
* dirty the page, and finally schedule it for writing. Alternatively, we
* could write-through just the portion of data that would go into that
* page, but that would kill performance for applications that write data
* line by line, and it's prone to race conditions.
*
* Note that this routine doesn't try to keep track of dirty pages. Each
* file system has to do this all by itself, unfortunately.
* okir@monad.swb.de
*
* Parameters:
*   file  - open file being written. Its f_flags are consulted for
*           O_APPEND, O_LARGEFILE, O_DIRECT and O_SYNC, and a pending
*           f_error is returned and cleared.
*   buf   - user-space source buffer; checked with access_ok(VERIFY_READ).
*   count - number of bytes requested. May be clamped by RLIMIT_FSIZE,
*           MAX_NON_LFS, the superblock's s_maxbytes, or (for block
*           devices) the inode size.
*   ppos  - in/out file position. Read once on entry and stored back after
*           the page loop (or after a successful direct write).
*
* Returns: the value of `written` when it is non-zero, otherwise `status`
* (0 or a negative errno from the last step). The early-exit checks return
* -EINVAL, -EFAULT, -EFBIG, -EPERM, -ENOSPC or the saved file->f_error
* directly. SIGXFSZ is sent to the current task on the -EFBIG paths.
*
* inode->i_sem is held from just after the access_ok() check until the
* "out" label.
*/
ssize_t
generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
{
struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
struct inode *inode = mapping->host;
unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
loff_t pos;
/*
* cached_page is handed to __grab_cache_page() by address and whatever is
* left in it is released at "done".
* NOTE(review): presumably a spare page used when the target index is not
* yet in the page cache - confirm against __grab_cache_page().
*/
struct page *page, *cached_page;
ssize_t written;
long status = 0;
int err;
unsigned bytes;
/* Reject a count that is negative when viewed as ssize_t. */
if ((ssize_t) count < 0)
return -EINVAL;
/* Check that the user buffer [buf, buf+count) may be read. */
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
cached_page = NULL;
/*
* Take the inode semaphore i_sem; it is released at "out".
* NOTE(review): presumably this serializes writers on the same inode.
*/
down(&inode->i_sem);
pos = *ppos; /* file offset, i.e. the current position */
err = -EINVAL;
if (pos < 0)
goto out;
/* Report an error previously recorded on the file, clearing it. */
err = file->f_error;
if (err) {
file->f_error = 0;
goto out;
}
written = 0;
/* FIXME: this is for backwards compatibility with 2.4 */
/*
* If the inode is not a block device and O_APPEND is set, start at the
* current file size (i_size) so that the new data is appended.
* Only the local pos changes here; *ppos is stored back later.
*/
if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
pos = inode->i_size;
/*
* Check whether we've reached the file size limit (RLIMIT_FSIZE).
* Block devices and an unlimited rlimit skip this check.
*/
err = -EFBIG;
if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
if (pos >= limit) {
send_sig(SIGXFSZ, current, 0);
goto out;
}
/* Fix this up when we got to rlimit64 */
/* Positions above 32 bits write nothing; otherwise clamp to the limit. */
if (pos > 0xFFFFFFFFULL)
count = 0;
else if(count > limit - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
count = limit - (u32)pos;
}
}
/*
* LFS rule: without O_LARGEFILE the write may not extend past
* MAX_NON_LFS. err is still -EFBIG from above on the failing path.
*/
if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
if (pos >= MAX_NON_LFS) {
send_sig(SIGXFSZ, current, 0);
goto out;
}
if (count > MAX_NON_LFS - (u32)pos) {
/* send_sig(SIGXFSZ, current, 0); */
count = MAX_NON_LFS - (u32)pos;
}
}
/*
* Are we about to exceed the fs block limit ?
*
* If we have written data it becomes a short write
* If we have exceeded without writing data we send
* a signal and give them an EFBIG.
*
* Linus frestrict idea will clean these up nicely..
*/
if (!S_ISBLK(inode->i_mode)) {
if (pos >= inode->i_sb->s_maxbytes)
{
if (count || pos > inode->i_sb->s_maxbytes) {
send_sig(SIGXFSZ, current, 0);
err = -EFBIG;
goto out;
}
/* zero-length writes at ->s_maxbytes are OK */
}
/* Trim the request so it ends at s_maxbytes (short write). */
if (pos + count > inode->i_sb->s_maxbytes)
count = inode->i_sb->s_maxbytes - pos;
} else {
/* Block device: refuse read-only devices and writes past i_size. */
if (is_read_only(inode->i_rdev)) {
err = -EPERM;
goto out;
}
if (pos >= inode->i_size) {
if (count || pos > inode->i_size) {
err = -ENOSPC;
goto out;
}
}
/* Trim the request so it ends at i_size (short write). */
if (pos + count > inode->i_size)
count = inode->i_size - pos;
}
/* Nothing left to write after clamping: succeed with 0. */
err = 0;
if (count == 0)
goto out;
/* NOTE(review): presumably clears set-id bits on the inode - confirm. */
remove_suid(inode);
/*
* Set the inode's change time and modification time to the current
* time and mark the inode dirty.
*/
inode->i_ctime = inode->i_mtime = CURRENT_TIME;
mark_inode_dirty_sync(inode);
/*
* With O_DIRECT the per-page loop below is skipped; the write is handed
* to generic_file_direct_IO() at the "o_direct" label.
*/
if (file->f_flags & O_DIRECT)
goto o_direct;
/*
* O_DIRECT not set: write the data one page at a time.
*/
do {
unsigned long index, offset;
long page_fault;
char *kaddr;
/*
* deactivate stays 1 when this chunk runs to the end of the page and
* is cleared when the write ends inside the page; it selects between
* deactivate_page() and mark_page_accessed() below.
*/
int deactivate = 1;
/*
* Try to find the page in the cache. If it isn't there,
* allocate a free page.
*/
/* Byte offset of pos inside its page. */
offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
/* Index of the page that contains pos. */
index = pos >> PAGE_CACHE_SHIFT;
/* Bytes for this iteration: rest of the page, capped by count. */
bytes = PAGE_CACHE_SIZE - offset;
if (bytes > count) {
bytes = count;
deactivate = 0;
}
/*
* Bring in the user page that we will copy from _first_.
* Otherwise there's a nasty deadlock on copying from the
* same page as we're writing to, without it being marked
* up-to-date.
*
* This touches the first and the last byte of the user range.
*/
{ volatile unsigned char dummy;
__get_user(dummy, buf);
__get_user(dummy, buf+bytes-1);
}
status = -ENOMEM; /* we'll assign it later anyway */
/*
* Get the page for this index; a NULL result ends the loop with
* status still -ENOMEM. The page is expected to come back locked
* (checked just below) and is dropped with page_cache_release().
* NOTE(review): per the original annotation, __grab_cache_page() looks
* the page up in the page cache and, if it is not cached, allocates one
* with page_cache_alloc and inserts it via add_to_page_cache_unique
* (inode queue, hash table, LRU), locking it and taking a reference.
* Its body is not shown here - confirm.
*/
page = __grab_cache_page(mapping, index, &cached_page);
if (!page)
break;
/* We have exclusive IO access to the page.. */
if (!PageLocked(page)) {
PAGE_BUG(page);
}
/*
* Map the page and get the kernel address of its start.
*/
kaddr = kmap(page);
/*
* Let the address_space prepare the range [offset, offset+bytes) of the
* page. A non-zero status aborts via "sync_failure".
* NOTE(review): per the original annotation, prepare_write sets up the
* buffer heads for the page and may read buffers from disk
* (ll_rw_block) when needed - implementation not shown, confirm.
*/
status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
if (status)
goto sync_failure;
/*
* Copy the data from the user buffer into the page. A non-zero
* result is remembered in page_fault and handled after commit_write.
*/
page_fault = __copy_from_user(kaddr+offset, buf, bytes);
flush_dcache_page(page);
conditional_schedule();
/*
* Commit the same range through the address_space.
* NOTE(review): per the original annotation, commit_write marks the
* buffers dirty so they are written to disk later - confirm.
*/
status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
if (page_fault)
goto fail_write;
/* A zero status from commit_write counts as all bytes written. */
if (!status)
status = bytes;
/* Advance the position, user pointer and totals by this chunk. */
if (status >= 0) {
written += status;
count -= status;
pos += status;
buf += status;
}
unlock:
/* Also entered from "fail_write" with status == -EFAULT. */
kunmap(page);
/* Mark it unlocked again and drop the page.. */
UnlockPage(page);
if (deactivate)
deactivate_page(page);
else
mark_page_accessed(page);
page_cache_release(page);
if (status < 0)
break;
} while (count);
done:
/* Store the final position back and release any unused spare page. */
*ppos = pos;
if (cached_page)
page_cache_release(cached_page);
/* For now, when the user asks for O_SYNC, we'll actually
* provide O_DSYNC. */
if (status >= 0) {
if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
}
out_status:
/* Return written when it is non-zero; otherwise return status. */
err = written ? written : status;
out:
/* Common exit: release the inode semaphore and return err. */
up(&inode->i_sem);
return err;
fail_write:
/* The copy from user space failed: report -EFAULT and release the page. */
status = -EFAULT;
goto unlock;
sync_failure:
/*
* If blocksize < pagesize, prepare_write() may have instantiated a
* few blocks outside i_size. Trim these off again.
*
* Reached when prepare_write() fails, with the page still mapped and
* locked, so it is unmapped, unlocked and released here first.
*/
kunmap(page);
UnlockPage(page);
page_cache_release(page);
if (pos + bytes > inode->i_size)
vmtruncate(inode, inode->i_size);
goto done;
o_direct:
/*
* O_DIRECT path: hand the whole request to generic_file_direct_IO().
* On a positive result, grow i_size if the write ended past it (not for
* block devices), store the new position in *ppos and call
* invalidate_inode_pages2() on the mapping.
* NOTE(review): presumably the invalidate drops now-stale cached pages -
* confirm. *ppos is left untouched when written <= 0.
*/
written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
if (written > 0) {
loff_t end = pos + written;
if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
inode->i_size = end;
mark_inode_dirty(inode);
}
*ppos = end;
invalidate_inode_pages2(mapping);
}
/*
* Sync the fs metadata but not the minor inode changes and
* of course not the data as we did direct DMA for the IO.
*/
if (written >= 0 && file->f_flags & O_SYNC)
status = generic_osync_inode(inode, OSYNC_METADATA);
goto out_status;
}