Chinaunix首页 | 论坛 | 博客
  • 博客访问: 286065
  • 博文数量: 60
  • 博客积分: 2697
  • 博客等级: 少校
  • 技术积分: 653
  • 用 户 组: 普通用户
  • 注册时间: 2010-07-13 15:52
文章分类

全部博文(60)

文章存档

2012年(6)

2011年(31)

2010年(23)

分类: LINUX

2010-12-30 20:31:26

见内核源码 mm/filemap.c

 

/*
 * Write to a file through the page cache.
 *
 * We currently put everything into the page cache prior to writing it.
 * This is not a problem when writing full pages. With partial pages,
 * however, we first have to read the data into the cache, then
 * dirty the page, and finally schedule it for writing. Alternatively, we
 * could write-through just the portion of data that would go into that
 * page, but that would kill performance for applications that write data
 * line by line, and it's prone to race conditions.
 *
 * Note that this routine doesn't try to keep track of dirty pages. Each
 * file system has to do this all by itself, unfortunately.
 *                          okir@monad.swb.de
 *
 * Parameters:
 *   file  - open file being written; its f_flags are consulted for
 *           O_APPEND, O_LARGEFILE, O_DIRECT and O_SYNC, and a pending
 *           f_error is returned and cleared.
 *   buf   - user-space source buffer (range-checked with access_ok()).
 *   count - number of bytes requested; may be trimmed by the rlimit,
 *           LFS, s_maxbytes or block-device-size checks below.
 *   ppos  - in/out file position. Read once after i_sem is taken and
 *           written back at "done" (buffered path) or after a
 *           successful direct write; left untouched on the early
 *           "goto out" error paths.
 *
 * Returns "written" when it is non-zero, otherwise "status" (0 or a
 * negative errno). Early failures return -EINVAL, -EFAULT, -EFBIG,
 * -EPERM, -ENOSPC or the saved file->f_error directly.
 */
ssize_t
generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
{
    struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
    struct inode    *inode = mapping->host;
    unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
    loff_t      pos;

    /*
     * cached_page is a spare page handed to __grab_cache_page() by
     * address; presumably it is the page allocated when the target page
     * is not yet in the page cache (TODO confirm against
     * __grab_cache_page()). Whatever is left in it is released at "done".
     */
    struct page *page, *cached_page;

    ssize_t     written;
    long        status = 0;
    int     err;
    unsigned    bytes;

    /* Reject a count that is negative when viewed as ssize_t. */
    if ((ssize_t) count < 0)
        return -EINVAL;

    /* Check that the user-space range [buf, buf+count) may be read. */
    if (!access_ok(VERIFY_READ, buf, count))
        return -EFAULT;

    cached_page = NULL;

    /*
     * Take the inode semaphore i_sem. It is held until the "out" label,
     * so only one writer at a time runs the rest of this function for
     * this inode.
     */
    down(&inode->i_sem);

    pos = *ppos;   /* file offset, i.e. the current position */

    err = -EINVAL;
    if (pos < 0)
        goto out;

    /* Report (and clear) an error previously recorded on the file. */
    err = file->f_error;
    if (err) {
        file->f_error = 0;
        goto out;
    }

    written = 0;

    /* FIXME: this is for backwards compatibility with 2.4 */

    /*
     * If the inode is not a block device and O_APPEND is set, start
     * writing at the current file size (i_size) so that all new data is
     * appended.
     */
    if (!S_ISBLK(inode->i_mode) && file->f_flags & O_APPEND)
        pos = inode->i_size;

    /*
     * Check whether we've reached the file size limit.
     * (RLIMIT_FSIZE; skipped for block devices and for RLIM_INFINITY.)
     */
    err = -EFBIG;

    if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
        if (pos >= limit) {
            send_sig(SIGXFSZ, current, 0);
            goto out;
        }
        /* Fix this up when we got to rlimit64 */
        if (pos > 0xFFFFFFFFULL)
            count = 0;
        else if(count > limit - (u32)pos) {
            /* send_sig(SIGXFSZ, current, 0); */
            count = limit - (u32)pos;
        }
    }

    /*
     *  LFS rule
     *
     *  Without O_LARGEFILE the write may not extend past MAX_NON_LFS;
     *  err is still -EFBIG here from the assignment above.
     */
    if ( pos + count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
        if (pos >= MAX_NON_LFS) {
            send_sig(SIGXFSZ, current, 0);
            goto out;
        }
        if (count > MAX_NON_LFS - (u32)pos) {
            /* send_sig(SIGXFSZ, current, 0); */
            count = MAX_NON_LFS - (u32)pos;
        }
    }

    /*
     *  Are we about to exceed the fs block limit ?
     *
     *  If we have written data it becomes a short write
     *  If we have exceeded without writing data we send
     *  a signal and give them an EFBIG.
     *
     *  Linus frestrict idea will clean these up nicely..
     */
     
    if (!S_ISBLK(inode->i_mode)) {
        if (pos >= inode->i_sb->s_maxbytes)
        {
            if (count || pos > inode->i_sb->s_maxbytes) {
                send_sig(SIGXFSZ, current, 0);
                err = -EFBIG;
                goto out;
            }
            /* zero-length writes at ->s_maxbytes are OK */
        }

        if (pos + count > inode->i_sb->s_maxbytes)
            count = inode->i_sb->s_maxbytes - pos;
    } else {
        /*
         * Block device: refuse read-only devices and clamp the write to
         * the device size recorded in i_size.
         */
        if (is_read_only(inode->i_rdev)) {
            err = -EPERM;
            goto out;
        }
        if (pos >= inode->i_size) {
            if (count || pos > inode->i_size) {
                err = -ENOSPC;
                goto out;
            }
        }

        if (pos + count > inode->i_size)
            count = inode->i_size - pos;
    }

    /* Nothing (left) to write: return 0. */
    err = 0;
    if (count == 0)
        goto out;

    remove_suid(inode);

    /*
     * Set the inode's change time and modification time to the current
     * time, then mark the inode dirty.
     */
    inode->i_ctime = inode->i_mtime = CURRENT_TIME;
    mark_inode_dirty_sync(inode);

    /*
     * With O_DIRECT the write is handed to generic_file_direct_IO()
     * instead of going through the per-page loop below.
     */
    if (file->f_flags & O_DIRECT)
        goto o_direct;

    /*
     * O_DIRECT is not set: write through the page cache, one page per
     * loop iteration.
     */
    do {
        unsigned long index, offset;
        long page_fault;
        char *kaddr;
        /*
         * deactivate stays 1 when this chunk runs to the end of the
         * page; it is cleared when the remaining count ends inside the
         * page. It selects deactivate_page() vs mark_page_accessed()
         * at "unlock".
         */
        int deactivate = 1;

        /*
         * Try to find the page in the cache. If it isn't there,
         * allocate a free page.
         */

        /* Byte offset of the write position inside the page. */
        offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */

        /* Index of the page-cache page that holds pos. */
        index = pos >> PAGE_CACHE_SHIFT;

        /* Bytes to write this iteration: up to the end of the page. */
        bytes = PAGE_CACHE_SIZE - offset;

        if (bytes > count) {
            bytes = count;
            deactivate = 0;
        }

        /*
         * Bring in the user page that we will copy from _first_.
         * Otherwise there's a nasty deadlock on copying from the
         * same page as we're writing to, without it being marked
         * up-to-date.
         */
        { volatile unsigned char dummy;
            __get_user(dummy, buf);
            __get_user(dummy, buf+bytes-1);
        }

        status = -ENOMEM;   /* we'll assign it later anyway */

        /*
         * Look the page up in the page cache. Per the original
         * annotation (not visible in this function; confirm against
         * __grab_cache_page()): if the page is not cached, a new page is
         * allocated with page_cache_alloc and inserted via
         * add_to_page_cache_unique -> __add_to_page_cache (inode queue
         * of clean pages, hash) and lru_cache_add (LRU); the page comes
         * back locked (PG_locked) with its reference count raised.
         * A NULL return leaves status at -ENOMEM and ends the loop.
         */
        page = __grab_cache_page(mapping, index, &cached_page);
        if (!page)
            break;

        /* We have exclusive IO access to the page.. */
        if (!PageLocked(page)) {
            PAGE_BUG(page);
        }

        /* Map the page and get the address of its first byte. */
        kaddr = kmap(page);

        /*
         * Call the address_space's prepare_write for the byte range
         * [offset, offset+bytes). Per the original annotation
         * (presumably for block-based filesystems using
         * block_prepare_write; verify): it sets up the buffer heads for
         * the page and may read some buffers from disk via ll_rw_block
         * when needed. A non-zero status goes to sync_failure.
         */
        status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
        if (status)
            goto sync_failure;

        /* Copy the bytes from the user buffer into the page. */
        page_fault = __copy_from_user(kaddr+offset, buf, bytes);
        flush_dcache_page(page);

                conditional_schedule();
        /*
         * Call the address_space's commit_write for the same range. Per
         * the original annotation (presumably generic_commit_write;
         * verify): it marks the buffers dirty so they are written to
         * disk later. Note it is called before the page_fault check, so
         * it runs even when the user copy faulted.
         */
        status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
        if (page_fault)
            goto fail_write;
        if (!status)
            status = bytes;

        /* Advance the bookkeeping by the amount reported in status. */
        if (status >= 0) {
            written += status;
            count -= status;
            pos += status;
            buf += status;
        }
unlock:
        kunmap(page);
        /* Mark it unlocked again and drop the page.. */
        UnlockPage(page);
        if (deactivate)
            deactivate_page(page);
        else
            mark_page_accessed(page);
        page_cache_release(page);

        if (status < 0)
            break;
    } while (count);
done:
    *ppos = pos;

    if (cached_page)
        page_cache_release(cached_page);

    /* For now, when the user asks for O_SYNC, we'll actually
     * provide O_DSYNC. */
    /*
     * NOTE(review): the result of generic_osync_inode() only reaches
     * the caller when written == 0 (see out_status below) - confirm
     * that dropping a sync error after a partial write is intended.
     */
    if (status >= 0) {
        if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
            status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
    }
   
out_status:
    /* Prefer the (non-zero) written value over status. */
    err = written ? written : status;
out:

    up(&inode->i_sem);
    return err;
fail_write:
    /*
     * The user copy faulted: report -EFAULT and rejoin the loop at
     * "unlock" to release the page; written/pos/buf are not advanced.
     */
    status = -EFAULT;
    goto unlock;

sync_failure:
    /*
     * If blocksize < pagesize, prepare_write() may have instantiated a
     * few blocks outside i_size.  Trim these off again.
     */
    kunmap(page);
    UnlockPage(page);
    page_cache_release(page);
    if (pos + bytes > inode->i_size)
        vmtruncate(inode, inode->i_size);
    goto done;

o_direct:
    /*
     * Direct path: on a positive result, grow i_size if the write
     * extended a non-block-device inode, publish the new position and
     * invalidate the mapping's pages.
     */
    written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
    if (written > 0) {
        loff_t end = pos + written;
        if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
            inode->i_size = end;
            mark_inode_dirty(inode);
        }
        *ppos = end;
        invalidate_inode_pages2(mapping);
    }
    /*
     * Sync the fs metadata but not the minor inode changes and
     * of course not the data as we did direct DMA for the IO.
     */
    if (written >= 0 && file->f_flags & O_SYNC)
        status = generic_osync_inode(inode, OSYNC_METADATA);
    goto out_status;
}

阅读(2350) | 评论(0) | 转发(0) |
给主人留下些什么吧!~~