/*
 *  linux/fs/buffer.c
 *
 *  Copyright (C) 1991, 1992, 2002  Linus Torvalds
 */

/*
 * Start bdflush() with kernel_thread not syscall - Paul Gortmaker, 12/95
 *
 * Removed a lot of unnecessary code and simplified things now that
 * the buffer cache isn't our primary cache - Andrew Tridgell 12/96
 *
 * Speed up hash, lru, and free list operations.  Use gfp() for allocating
 * hash table, use SLAB cache for buffer heads. SMP threading.  -DaveM
 *
 * Added 32k buffer block sizes - these are required for older ARM systems. - RMK
 *
 * async buffer flushing, 1999 Andrea Arcangeli <andrea@suse.de>
 */

#include <linux/kernel.h>
#include <linux/sched/signal.h>
#include <linux/syscalls.h>
#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/mm.h>
#include <linux/percpu.h>
#include <linux/slab.h>
#include <linux/capability.h>
#include <linux/blkdev.h>
#include <linux/file.h>
#include <linux/quotaops.h>
#include <linux/highmem.h>
#include <linux/export.h>
#include <linux/backing-dev.h>
#include <linux/writeback.h>
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
#include <linux/bitops.h>
#include <linux/mpage.h>
#include <linux/bit_spinlock.h>
#include <linux/pagevec.h>
#include <trace/events/block.h>

static int fsync_buffers_list(spinlock_t *lock, struct list_head *list);
static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
			 enum rw_hint hint, struct writeback_control *wbc);

#define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers)

inline void touch_buffer(struct buffer_head *bh)
{
	trace_block_touch_buffer(bh);
	mark_page_accessed(bh->b_page);
}
EXPORT_SYMBOL(touch_buffer);

void __lock_buffer(struct buffer_head *bh)
{
	wait_on_bit_lock_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__lock_buffer);

void unlock_buffer(struct buffer_head *bh)
{
	clear_bit_unlock(BH_Lock, &bh->b_state);
	smp_mb__after_atomic();
	wake_up_bit(&bh->b_state, BH_Lock);
}
EXPORT_SYMBOL(unlock_buffer);

/*
 * Returns if the page has dirty or writeback buffers. If all the buffers
 * are unlocked and clean then the PageDirty information is stale. If
 * any of the pages are locked, it is assumed they are locked for IO.
 */
void buffer_check_dirty_writeback(struct page *page,
				  bool *dirty, bool *writeback)
{
	struct buffer_head *head, *bh;
	*dirty = false;
	*writeback = false;

	BUG_ON(!PageLocked(page));

	if (!page_has_buffers(page))
		return;

	if (PageWriteback(page))
		*writeback = true;

	head = page_buffers(page);
	bh = head;
	do {
		if (buffer_locked(bh))
			*writeback = true;

		if (buffer_dirty(bh))
			*dirty = true;

		bh = bh->b_this_page;
	} while (bh != head);
}
EXPORT_SYMBOL(buffer_check_dirty_writeback);

/*
 * Block until a buffer comes unlocked.  This doesn't stop it
 * from becoming locked again - you have to lock it yourself
 * if you want to preserve its state.
 */
void __wait_on_buffer(struct buffer_head *bh)
{
	wait_on_bit_io(&bh->b_state, BH_Lock, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(__wait_on_buffer);

static void __clear_page_buffers(struct page *page)
{
	ClearPagePrivate(page);
	set_page_private(page, 0);
	put_page(page);
}

static void buffer_io_error(struct buffer_head *bh, char *msg)
{
	if (!test_bit(BH_Quiet, &bh->b_state))
		printk_ratelimited(KERN_ERR
			"Buffer I/O error on dev %pg, logical block %llu%s\n",
			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
}

/*
 * End-of-IO handler helper function which does not touch the bh after
 * unlocking it.
 * Note: unlock_buffer() sort-of does touch the bh after unlocking it, but
 * a race there is benign: unlock_buffer() only uses the bh's address for
 * hashing after unlocking the buffer, so it doesn't actually touch the bh
 * itself.
 */
static void __end_buffer_read_notouch(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		/* This happens, due to failed read-ahead attempts. */
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
}

/*
 * Default synchronous end-of-IO handler..  Just mark it up-to-date and
 * unlock the buffer. This is what ll_rw_block uses too.
 */
void end_buffer_read_sync(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_read_sync);

void end_buffer_write_sync(struct buffer_head *bh, int uptodate)
{
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost sync page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	put_bh(bh);
}
EXPORT_SYMBOL(end_buffer_write_sync);

/*
 * Various filesystems appear to want __find_get_block to be non-blocking.
 * But it's the page lock which protects the buffers.  To get around this,
 * we get exclusion from try_to_free_buffers with the blockdev mapping's
 * private_lock.
 *
 * Hack idea: for the blockdev mapping, private_lock contention
 * may be quite high.  This code could TryLock the page, and if that
 * succeeds, there is no need to take private_lock.
 */
static struct buffer_head *
__find_get_block_slow(struct block_device *bdev, sector_t block)
{
	struct inode *bd_inode = bdev->bd_inode;
	struct address_space *bd_mapping = bd_inode->i_mapping;
	struct buffer_head *ret = NULL;
	pgoff_t index;
	struct buffer_head *bh;
	struct buffer_head *head;
	struct page *page;
	int all_mapped = 1;

	index = block >> (PAGE_SHIFT - bd_inode->i_blkbits);
	page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED);
	if (!page)
		goto out;

	spin_lock(&bd_mapping->private_lock);
	if (!page_has_buffers(page))
		goto out_unlock;
	head = page_buffers(page);
	bh = head;
	do {
		if (!buffer_mapped(bh))
			all_mapped = 0;
		else if (bh->b_blocknr == block) {
			ret = bh;
			get_bh(bh);
			goto out_unlock;
		}
		bh = bh->b_this_page;
	} while (bh != head);

	/* we might be here because some of the buffers on this page are
	 * not mapped.  This is due to various races between
	 * file io on the block device and getblk.  It gets dealt with
	 * elsewhere, don't buffer_error if we had some unmapped buffers
	 */
	if (all_mapped) {
		printk("__find_get_block_slow() failed. "
			"block=%llu, b_blocknr=%llu\n",
			(unsigned long long)block,
			(unsigned long long)bh->b_blocknr);
		printk("b_state=0x%08lx, b_size=%zu\n",
			bh->b_state, bh->b_size);
		printk("device %pg blocksize: %d\n", bdev,
			1 << bd_inode->i_blkbits);
	}
out_unlock:
	spin_unlock(&bd_mapping->private_lock);
	put_page(page);
out:
	return ret;
}

/*
 * I/O completion handler for block_read_full_page() - pages
 * which come unlocked at the end of I/O.
 */
static void end_buffer_async_read(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;
	int page_uptodate = 1;

	BUG_ON(!buffer_async_read(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		clear_buffer_uptodate(bh);
		buffer_io_error(bh, ", async page read");
		SetPageError(page);
	}

	/*
	 * Be _very_ careful from here on. Bad things can happen if
	 * two buffer heads end IO at almost the same time and both
	 * decide that the page is now completely done.
	 */
	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
	clear_buffer_async_read(bh);
	unlock_buffer(bh);
	tmp = bh;
	do {
		if (!buffer_uptodate(tmp))
			page_uptodate = 0;
		if (buffer_async_read(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	} while (tmp != bh);
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);

	/*
	 * If none of the buffers had errors and they are all
	 * uptodate then we can set the page uptodate.
	 */
	if (page_uptodate && !PageError(page))
		SetPageUptodate(page);
	unlock_page(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}

/*
 * Completion handler for block_write_full_page() - pages which are unlocked
 * during I/O, and which have PageWriteback cleared upon I/O completion.
 */
void end_buffer_async_write(struct buffer_head *bh, int uptodate)
{
	unsigned long flags;
	struct buffer_head *first;
	struct buffer_head *tmp;
	struct page *page;

	BUG_ON(!buffer_async_write(bh));

	page = bh->b_page;
	if (uptodate) {
		set_buffer_uptodate(bh);
	} else {
		buffer_io_error(bh, ", lost async page write");
		mark_buffer_write_io_error(bh);
		clear_buffer_uptodate(bh);
		SetPageError(page);
	}

	first = page_buffers(page);
	local_irq_save(flags);
	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);

	clear_buffer_async_write(bh);
	unlock_buffer(bh);
	tmp = bh->b_this_page;
	while (tmp != bh) {
		if (buffer_async_write(tmp)) {
			BUG_ON(!buffer_locked(tmp));
			goto still_busy;
		}
		tmp = tmp->b_this_page;
	}
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	end_page_writeback(page);
	return;

still_busy:
	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
	local_irq_restore(flags);
	return;
}
EXPORT_SYMBOL(end_buffer_async_write);

/*
 * If a page's buffers are under async read-in (end_buffer_async_read
 * completion) then there is a possibility that another thread of
 * control could lock one of the buffers after it has completed
 * but while some of the other buffers have not completed.  This
 * locked buffer would confuse end_buffer_async_read() into not unlocking
 * the page.  So the absence of BH_Async_Read tells end_buffer_async_read()
 * that this buffer is not under async I/O.
 *
 * The page comes unlocked when it has no locked buffer_async buffers
 * left.
 *
 * PageLocked prevents anyone from starting new async I/O reads against
 * any of the buffers.
 *
 * PageWriteback is used to prevent simultaneous writeout of the same
 * page.
 *
 * PageLocked prevents anyone from starting writeback of a page which is
 * under read I/O (PageWriteback is only ever set against a locked page).
 */
static void mark_buffer_async_read(struct buffer_head *bh)
{
	bh->b_end_io = end_buffer_async_read;
	set_buffer_async_read(bh);
}

static void mark_buffer_async_write_endio(struct buffer_head *bh,
					  bh_end_io_t *handler)
{
	bh->b_end_io = handler;
	set_buffer_async_write(bh);
}

void mark_buffer_async_write(struct buffer_head *bh)
{
	mark_buffer_async_write_endio(bh, end_buffer_async_write);
}
EXPORT_SYMBOL(mark_buffer_async_write);


/*
 * fs/buffer.c contains helper functions for buffer-backed address space's
 * fsync functions.  A common requirement for buffer-based filesystems is
 * that certain data from the backing blockdev needs to be written out for
 * a successful fsync().  For example, ext2 indirect blocks need to be
 * written back and waited upon before fsync() returns.
 *
 * The functions mark_buffer_inode_dirty(), fsync_inode_buffers(),
 * inode_has_buffers() and invalidate_inode_buffers() are provided for the
 * management of a list of dependent buffers at ->i_mapping->private_list.
 *
 * Locking is a little subtle: try_to_free_buffers() will remove buffers
 * from their controlling inode's queue when they are being freed.  But
 * try_to_free_buffers() will be operating against the *blockdev* mapping
 * at the time, not against the S_ISREG file which depends on those buffers.
 * So the locking for private_list is via the private_lock in the address_space
 * which backs the buffers.  Which is different from the address_space
 * against which the buffers are listed.
 * So for a particular address_space, mapping->private_lock does *not*
 * protect mapping->private_list!  In fact, mapping->private_list will
 * always be protected by the backing blockdev's ->private_lock.
 *
 * Which introduces a requirement: all buffers on an address_space's
 * ->private_list must be from the same address_space: the blockdev's.
 *
 * address_spaces which do not place buffers at ->private_list via these
 * utility functions are free to use private_lock and private_list for
 * whatever they want.  The only requirement is that list_empty(private_list)
 * be true at clear_inode() time.
 *
 * FIXME: clear_inode should not call invalidate_inode_buffers().  The
 * filesystems should do that.  invalidate_inode_buffers() should just go
 * BUG_ON(!list_empty).
 *
 * FIXME: mark_buffer_dirty_inode() is a data-plane operation.  It should
 * take an address_space, not an inode.  And it should be called
 * mark_buffer_dirty_fsync() to clearly define why those buffers are being
 * queued up.
 *
 * FIXME: mark_buffer_dirty_inode() doesn't need to add the buffer to the
 * list if it is already on a list.  Because if the buffer is on a list,
 * it *must* already be on the right one.  If not, the filesystem is being
 * silly.  This will save a ton of locking.  But first we have to ensure
 * that buffers are taken *off* the old inode's list when they are freed
 * (presumably in truncate).  That requires careful auditing of all
 * filesystems (do it inside bforget()).  It could also be done by bringing
 * b_inode back.
 */

/*
 * The buffer's backing address_space's private_lock must be held
 */
static void __remove_assoc_queue(struct buffer_head *bh)
{
	list_del_init(&bh->b_assoc_buffers);
	WARN_ON(!bh->b_assoc_map);
	bh->b_assoc_map = NULL;
}

int inode_has_buffers(struct inode *inode)
{
	return !list_empty(&inode->i_data.private_list);
}

/*
 * osync is designed to support O_SYNC io.  It waits synchronously for
 * all already-submitted IO to complete, but does not queue any new
 * writes to the disk.
 *
 * To do O_SYNC writes, just queue the buffer writes with ll_rw_block as
 * you dirty the buffers, and then use osync_inode_buffers to wait for
 * completion.  Any other dirty buffers which are not yet queued for
 * write will not be flushed to disk by the osync.
 */
static int osync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head *p;
	int err = 0;

	spin_lock(lock);
repeat:
	list_for_each_prev(p, list) {
		bh = BH_ENTRY(p);
		if (buffer_locked(bh)) {
			get_bh(bh);
			spin_unlock(lock);
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				err = -EIO;
			brelse(bh);
			spin_lock(lock);
			goto repeat;
		}
	}
	spin_unlock(lock);
	return err;
}

void emergency_thaw_bdev(struct super_block *sb)
{
	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
}

/**
 * sync_mapping_buffers - write out & wait upon a mapping's "associated" buffers
 * @mapping: the mapping which wants those buffers written
 *
 * Starts I/O against the buffers at mapping->private_list, and waits upon
 * that I/O.
 *
 * Basically, this is a convenience function for fsync().
 * @mapping is a file or directory which needs those buffers to be written for
 * a successful fsync().
 */
int sync_mapping_buffers(struct address_space *mapping)
{
	struct address_space *buffer_mapping = mapping->private_data;

	if (buffer_mapping == NULL || list_empty(&mapping->private_list))
		return 0;

	return fsync_buffers_list(&buffer_mapping->private_lock,
					&mapping->private_list);
}
EXPORT_SYMBOL(sync_mapping_buffers);

/*
 * Called when we've recently written block `bblock', and it is known that
 * `bblock' was for a buffer_boundary() buffer.
 * This means that the block at `bblock + 1' is probably a dirty indirect
 * block.  Hunt it down and, if it's dirty, schedule it for IO.  So that
 * indirects merge nicely with their data.
 */
void write_boundary_block(struct block_device *bdev,
			sector_t bblock, unsigned blocksize)
{
	struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize);
	if (bh) {
		if (buffer_dirty(bh))
			ll_rw_block(REQ_OP_WRITE, 0, 1, &bh);
		put_bh(bh);
	}
}

void mark_buffer_dirty_inode(struct buffer_head *bh, struct inode *inode)
{
	struct address_space *mapping = inode->i_mapping;
	struct address_space *buffer_mapping = bh->b_page->mapping;

	mark_buffer_dirty(bh);
	if (!mapping->private_data) {
		mapping->private_data = buffer_mapping;
	} else {
		BUG_ON(mapping->private_data != buffer_mapping);
	}
	if (!bh->b_assoc_map) {
		spin_lock(&buffer_mapping->private_lock);
		list_move_tail(&bh->b_assoc_buffers,
				&mapping->private_list);
		bh->b_assoc_map = mapping;
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(mark_buffer_dirty_inode);

/*
 * Mark the page dirty, and set it dirty in the radix tree, and mark the inode
 * dirty.
 *
 * If warn is true, then emit a warning if the page is not uptodate and has
 * not been truncated.
 *
 * The caller must hold lock_page_memcg().
 */
void __set_page_dirty(struct page *page, struct address_space *mapping,
			     int warn)
{
	unsigned long flags;

	xa_lock_irqsave(&mapping->i_pages, flags);
	if (page->mapping) {	/* Race with truncate? */
		WARN_ON_ONCE(warn && !PageUptodate(page));
		account_page_dirtied(page, mapping);
		radix_tree_tag_set(&mapping->i_pages,
				page_index(page), PAGECACHE_TAG_DIRTY);
	}
	xa_unlock_irqrestore(&mapping->i_pages, flags);
}
EXPORT_SYMBOL_GPL(__set_page_dirty);

/*
 * Add a page to the dirty page list.
 *
 * It is a sad fact of life that this function is called from several places
 * deeply under spinlocking.  It may not sleep.
 *
 * If the page has buffers, the uptodate buffers are set dirty, to preserve
 * dirty-state coherency between the page and the buffers.  If the page does
 * not have buffers then when they are later attached they will all be set
 * dirty.
 *
 * The buffers are dirtied before the page is dirtied.  There's a small race
 * window in which a writepage caller may see the page cleanness but not the
 * buffer dirtiness.  That's fine.  If this code were to set the page dirty
 * before the buffers, a concurrent writepage caller could clear the page dirty
 * bit, see a bunch of clean buffers and we'd end up with dirty buffers/clean
 * page on the dirty page list.
 *
 * We use private_lock to lock against try_to_free_buffers while using the
 * page's buffer list.  Also use this to protect against clean buffers being
 * added to the page after it was set dirty.
 *
 * FIXME: may need to call ->reservepage here as well.  That's rather up to the
 * address_space though.
 */
int __set_page_dirty_buffers(struct page *page)
{
	int newly_dirty;
	struct address_space *mapping = page_mapping(page);

	if (unlikely(!mapping))
		return !TestSetPageDirty(page);

	spin_lock(&mapping->private_lock);
	if (page_has_buffers(page)) {
		struct buffer_head *head = page_buffers(page);
		struct buffer_head *bh = head;

		do {
			set_buffer_dirty(bh);
			bh = bh->b_this_page;
		} while (bh != head);
	}
	/*
	 * Lock out page->mem_cgroup migration to keep PageDirty
	 * synchronized with per-memcg dirty page counters.
	 */
	lock_page_memcg(page);
	newly_dirty = !TestSetPageDirty(page);
	spin_unlock(&mapping->private_lock);

	if (newly_dirty)
		__set_page_dirty(page, mapping, 1);

	unlock_page_memcg(page);

	if (newly_dirty)
		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

	return newly_dirty;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);

/*
 * Write out and wait upon a list of buffers.
 *
 * We have conflicting pressures: we want to make sure that all
 * initially dirty buffers get waited on, but that any subsequently
 * dirtied buffers don't.  After all, we don't want fsync to last
 * forever if somebody is actively writing to the file.
 *
 * Do this in two main stages: first we copy dirty buffers to a
 * temporary inode list, queueing the writes as we go.  Then we clean
 * up, waiting for those writes to complete.
 *
 * During this second stage, any subsequent updates to the file may end
 * up refiling the buffer on the original inode's dirty list again, so
 * there is a chance we will end up with a buffer queued for write but
 * not yet completed on that list.  So, as a final cleanup we go through
 * the osync code to catch these locked, dirty buffers without requeuing
 * any newly dirty buffers for write.
 */
static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
{
	struct buffer_head *bh;
	struct list_head tmp;
	struct address_space *mapping;
	int err = 0, err2;
	struct blk_plug plug;

	INIT_LIST_HEAD(&tmp);
	blk_start_plug(&plug);

	spin_lock(lock);
	while (!list_empty(list)) {
		bh = BH_ENTRY(list->next);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh) || buffer_locked(bh)) {
			list_add(&bh->b_assoc_buffers, &tmp);
			bh->b_assoc_map = mapping;
			if (buffer_dirty(bh)) {
				get_bh(bh);
				spin_unlock(lock);
				/*
				 * Ensure any pending I/O completes so that
				 * write_dirty_buffer() actually writes the
				 * current contents - it is a noop if I/O is
				 * still in flight on potentially older
				 * contents.
				 */
				write_dirty_buffer(bh, REQ_SYNC);

				/*
				 * Kick off IO for the previous mapping. Note
				 * that we will not run the very last mapping,
				 * wait_on_buffer() will do that for us
				 * through sync_buffer().
				 */
				brelse(bh);
				spin_lock(lock);
			}
		}
	}

	spin_unlock(lock);
	blk_finish_plug(&plug);
	spin_lock(lock);

	while (!list_empty(&tmp)) {
		bh = BH_ENTRY(tmp.prev);
		get_bh(bh);
		mapping = bh->b_assoc_map;
		__remove_assoc_queue(bh);
		/* Avoid race with mark_buffer_dirty_inode() which does
		 * a lockless check and we rely on seeing the dirty bit */
		smp_mb();
		if (buffer_dirty(bh)) {
			list_add(&bh->b_assoc_buffers,
				 &mapping->private_list);
			bh->b_assoc_map = mapping;
		}
		spin_unlock(lock);
		wait_on_buffer(bh);
		if (!buffer_uptodate(bh))
			err = -EIO;
		brelse(bh);
		spin_lock(lock);
	}

	spin_unlock(lock);
	err2 = osync_buffers_list(lock, list);
	if (err)
		return err;
	else
		return err2;
}

/*
 * Invalidate any and all dirty buffers on a given inode.  We are
 * probably unmounting the fs, but that doesn't mean we have already
 * done a sync().  Just drop the buffers from the inode list.
 *
 * NOTE: we take the inode's blockdev's mapping's private_lock.  Which
 * assumes that all the buffers are against the blockdev.  Not true
 * for reiserfs.
 */
void invalidate_inode_buffers(struct inode *inode)
{
	if (inode_has_buffers(inode)) {
		struct address_space *mapping = &inode->i_data;
		struct list_head *list = &mapping->private_list;
		struct address_space *buffer_mapping = mapping->private_data;

		spin_lock(&buffer_mapping->private_lock);
		while (!list_empty(list))
			__remove_assoc_queue(BH_ENTRY(list->next));
		spin_unlock(&buffer_mapping->private_lock);
	}
}
EXPORT_SYMBOL(invalidate_inode_buffers);

/*
 * Remove any clean buffers from the inode's buffer list.  This is called
 * when we're trying to free the inode itself.  Those buffers can pin it.
 *
 * Returns true if all buffers were removed.
*/intremove_inode_buffers(structinode*inode){intret=1;if(inode_has_buffers(inode)){structaddress_space*mapping=&inode->i_data;structlist_head*list=&mapping->private_list;structaddress_space*buffer_mapping=mapping->private_data;spin_lock(&buffer_mapping->private_lock);while(!list_empty(list)){structbuffer_head*bh=BH_ENTRY(list->next);if(buffer_dirty(bh)){ret=0;break;}__remove_assoc_queue(bh);}spin_unlock(&buffer_mapping->private_lock);}returnret;}/* * Create the appropriate buffers when given a page for data area and * the size of each buffer.. Use the bh->b_this_page linked list to * follow the buffers created. Return NULL if unable to create more * buffers. * * The retry flag is used to differentiate async IO (paging, swapping) * which may not fail from ordinary buffer allocations. */structbuffer_head*alloc_page_buffers(structpage*page,unsignedlongsize,boolretry){structbuffer_head*bh,*head;gfp_tgfp=GFP_NOFS;longoffset;if(retry)gfp|=__GFP_NOFAIL;head=NULL;offset=PAGE_SIZE;while((offset-=size)>=0){bh=alloc_buffer_head(gfp);if(!bh)gotono_grow;bh->b_this_page=head;bh->b_blocknr=-1;head=bh;bh->b_size=size;/* Link the buffer to its page */set_bh_page(bh,page,offset);}returnhead;/* * In case anything failed, we just free everything we got. */no_grow:if(head){do{bh=head;head=head->b_this_page;free_buffer_head(bh);}while(head);}returnNULL;}EXPORT_SYMBOL_GPL(alloc_page_buffers);staticinlinevoidlink_dev_buffers(structpage*page,structbuffer_head*head){structbuffer_head*bh,*tail;bh=head;do{tail=bh;bh=bh->b_this_page;}while(bh);tail->b_this_page=head;attach_page_buffers(page,head);}staticsector_tblkdev_max_block(structblock_device*bdev,unsignedintsize){sector_tretval=~((sector_t)0);loff_tsz=i_size_read(bdev->bd_inode);if(sz){unsignedintsizebits=blksize_bits(size);retval=(sz>>sizebits);}returnretval;}/* * Initialise the state of a blockdev page's buffers. */staticsector_tinit_page_buffers(structpage*page,structblock_device*bdev,sector_tblock,intsize){structbuffer_head*head=page_buffers(page);structbuffer_head*bh=head;intuptodate=PageUptodate(page);sector_tend_block=blkdev_max_block(I_BDEV(bdev->bd_inode),size);do{if(!buffer_mapped(bh)){bh->b_end_io=NULL;bh->b_private=NULL;bh->b_bdev=bdev;bh->b_blocknr=block;if(uptodate)set_buffer_uptodate(bh);if(block<end_block)set_buffer_mapped(bh);}block++;bh=bh->b_this_page;}while(bh!=head);/* * Caller needs to validate requested block against end of device. */returnend_block;}/* * Create the page-cache page that contains the requested block. * * This is used purely for blockdev mappings. */staticintgrow_dev_page(structblock_device*bdev,sector_tblock,pgoff_tindex,intsize,intsizebits,gfp_tgfp){structinode*inode=bdev->bd_inode;structpage*page;structbuffer_head*bh;sector_tend_block;intret=0;/* Will call free_more_memory() */gfp_tgfp_mask;gfp_mask=mapping_gfp_constraint(inode->i_mapping,~__GFP_FS)|gfp;/* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer * looping in the allocator rather than here, at least that * code knows what it's doing. */gfp_mask|=__GFP_NOFAIL;page=find_or_create_page(inode->i_mapping,index,gfp_mask);BUG_ON(!PageLocked(page));if(page_has_buffers(page)){bh=page_buffers(page);if(bh->b_size==size){end_block=init_page_buffers(page,bdev,(sector_t)index<<sizebits,size);gotodone;}if(!try_to_free_buffers(page))gotofailed;}/* * Allocate some buffers for this page */bh=alloc_page_buffers(page,size,true);/* * Link the page to the buffers and initialise them. 
Take the * lock to be atomic wrt __find_get_block(), which does not * run under the page lock. */spin_lock(&inode->i_mapping->private_lock);link_dev_buffers(page,bh);end_block=init_page_buffers(page,bdev,(sector_t)index<<sizebits,size);spin_unlock(&inode->i_mapping->private_lock);done:ret=(block<end_block)?1:-ENXIO;failed:unlock_page(page);put_page(page);returnret;}/* * Create buffers for the specified block device block's page. If * that page was dirty, the buffers are set dirty also. */staticintgrow_buffers(structblock_device*bdev,sector_tblock,intsize,gfp_tgfp){pgoff_tindex;intsizebits;sizebits=-1;do{sizebits++;}while((size<<sizebits)<PAGE_SIZE);index=block>>sizebits;/* * Check for a block which wants to lie outside our maximum possible * pagecache index. (this comparison is done using sector_t types). */if(unlikely(index!=block>>sizebits)){printk(KERN_ERR"%s: requested out-of-range block %llu for ""device %pg\n",__func__,(unsignedlonglong)block,bdev);return-EIO;}/* Create a page with the proper size buffers.. */returngrow_dev_page(bdev,block,index,size,sizebits,gfp);}staticstructbuffer_head*__getblk_slow(structblock_device*bdev,sector_tblock,unsignedsize,gfp_tgfp){/* Size must be multiple of hard sectorsize */if(unlikely(size&(bdev_logical_block_size(bdev)-1)||(size<512||size>PAGE_SIZE))){printk(KERN_ERR"getblk(): invalid block size %d requested\n",size);printk(KERN_ERR"logical block size: %d\n",bdev_logical_block_size(bdev));dump_stack();returnNULL;}for(;;){structbuffer_head*bh;intret;bh=__find_get_block(bdev,block,size);if(bh)returnbh;ret=grow_buffers(bdev,block,size,gfp);if(ret<0)returnNULL;}}/* * The relationship between dirty buffers and dirty pages: * * Whenever a page has any dirty buffers, the page's dirty bit is set, and * the page is tagged dirty in its radix tree. * * At all times, the dirtiness of the buffers represents the dirtiness of * subsections of the page. If the page has buffers, the page dirty bit is * merely a hint about the true dirty state. * * When a page is set dirty in its entirety, all its buffers are marked dirty * (if the page has buffers). * * When a buffer is marked dirty, its page is dirtied, but the page's other * buffers are not. * * Also. When blockdev buffers are explicitly read with bread(), they * individually become uptodate. But their backing page remains not * uptodate - even if all of its buffers are uptodate. A subsequent * block_read_full_page() against that page will discover all the uptodate * buffers, will set the page uptodate and will perform no I/O. *//** * mark_buffer_dirty - mark a buffer_head as needing writeout * @bh: the buffer_head to mark dirty * * mark_buffer_dirty() will set the dirty bit against the buffer, then set its * backing page dirty, then tag the page as dirty in its address_space's radix * tree and then attach the address_space's inode to its superblock's dirty * inode list. * * mark_buffer_dirty() is atomic. It takes bh->b_page->mapping->private_lock, * i_pages lock and mapping->host->i_lock. */voidmark_buffer_dirty(structbuffer_head*bh){WARN_ON_ONCE(!buffer_uptodate(bh));trace_block_dirty_buffer(bh);/* * Very *carefully* optimize the it-is-already-dirty case. * * Don't let the final "is it dirty" escape to before we * perhaps modified the buffer. 
*/if(buffer_dirty(bh)){smp_mb();if(buffer_dirty(bh))return;}if(!test_set_buffer_dirty(bh)){structpage*page=bh->b_page;structaddress_space*mapping=NULL;lock_page_memcg(page);if(!TestSetPageDirty(page)){mapping=page_mapping(page);if(mapping)__set_page_dirty(page,mapping,0);}unlock_page_memcg(page);if(mapping)__mark_inode_dirty(mapping->host,I_DIRTY_PAGES);}}EXPORT_SYMBOL(mark_buffer_dirty);voidmark_buffer_write_io_error(structbuffer_head*bh){set_buffer_write_io_error(bh);/* FIXME: do we need to set this in both places? */if(bh->b_page&&bh->b_page->mapping)mapping_set_error(bh->b_page->mapping,-EIO);if(bh->b_assoc_map)mapping_set_error(bh->b_assoc_map,-EIO);}EXPORT_SYMBOL(mark_buffer_write_io_error);/* * Decrement a buffer_head's reference count. If all buffers against a page * have zero reference count, are clean and unlocked, and if the page is clean * and unlocked then try_to_free_buffers() may strip the buffers from the page * in preparation for freeing it (sometimes, rarely, buffers are removed from * a page but it ends up not being freed, and buffers may later be reattached). */void__brelse(structbuffer_head*buf){if(atomic_read(&buf->b_count)){put_bh(buf);return;}WARN(1,KERN_ERR"VFS: brelse: Trying to free free buffer\n");}EXPORT_SYMBOL(__brelse);/* * bforget() is like brelse(), except it discards any * potentially dirty data. */void__bforget(structbuffer_head*bh){clear_buffer_dirty(bh);if(bh->b_assoc_map){structaddress_space*buffer_mapping=bh->b_page->mapping;spin_lock(&buffer_mapping->private_lock);list_del_init(&bh->b_assoc_buffers);bh->b_assoc_map=NULL;spin_unlock(&buffer_mapping->private_lock);}__brelse(bh);}EXPORT_SYMBOL(__bforget);staticstructbuffer_head*__bread_slow(structbuffer_head*bh){lock_buffer(bh);if(buffer_uptodate(bh)){unlock_buffer(bh);returnbh;}else{get_bh(bh);bh->b_end_io=end_buffer_read_sync;submit_bh(REQ_OP_READ,0,bh);wait_on_buffer(bh);if(buffer_uptodate(bh))returnbh;}brelse(bh);returnNULL;}/* * Per-cpu buffer LRU implementation. To reduce the cost of __find_get_block(). * The bhs[] array is sorted - newest buffer is at bhs[0]. Buffers have their * refcount elevated by one when they're in an LRU. A buffer can only appear * once in a particular CPU's LRU. A single buffer can be present in multiple * CPU's LRUs at the same time. * * This is a transparent caching front-end to sb_bread(), sb_getblk() and * sb_find_get_block(). * * The LRUs themselves only need locking against invalidate_bh_lrus. We use * a local interrupt disable for that. */#define BH_LRU_SIZE 16structbh_lru{structbuffer_head*bhs[BH_LRU_SIZE];};staticDEFINE_PER_CPU(structbh_lru,bh_lrus)={{NULL}};#ifdef CONFIG_SMP#define bh_lru_lock() local_irq_disable()#define bh_lru_unlock() local_irq_enable()#else#define bh_lru_lock() preempt_disable()#define bh_lru_unlock() preempt_enable()#endifstaticinlinevoidcheck_irqs_on(void){#ifdef irqs_disabledBUG_ON(irqs_disabled());#endif}/* * Install a buffer_head into this cpu's LRU. If not already in the LRU, it is * inserted at the front, and the buffer_head at the back if any is evicted. * Or, if already in the LRU it is moved to the front. */staticvoidbh_lru_install(structbuffer_head*bh){structbuffer_head*evictee=bh;structbh_lru*b;inti;check_irqs_on();bh_lru_lock();b=this_cpu_ptr(&bh_lrus);for(i=0;i<BH_LRU_SIZE;i++){swap(evictee,b->bhs[i]);if(evictee==bh){bh_lru_unlock();return;}}get_bh(bh);bh_lru_unlock();brelse(evictee);}/* * Look up the bh in this cpu's LRU. If it's there, move it to the head. 
*/staticstructbuffer_head*lookup_bh_lru(structblock_device*bdev,sector_tblock,unsignedsize){structbuffer_head*ret=NULL;unsignedinti;check_irqs_on();bh_lru_lock();for(i=0;i<BH_LRU_SIZE;i++){structbuffer_head*bh=__this_cpu_read(bh_lrus.bhs[i]);if(bh&&bh->b_blocknr==block&&bh->b_bdev==bdev&&bh->b_size==size){if(i){while(i){__this_cpu_write(bh_lrus.bhs[i],__this_cpu_read(bh_lrus.bhs[i-1]));i--;}__this_cpu_write(bh_lrus.bhs[0],bh);}get_bh(bh);ret=bh;break;}}bh_lru_unlock();returnret;}/* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return * NULL */structbuffer_head*__find_get_block(structblock_device*bdev,sector_tblock,unsignedsize){structbuffer_head*bh=lookup_bh_lru(bdev,block,size);if(bh==NULL){/* __find_get_block_slow will mark the page accessed */bh=__find_get_block_slow(bdev,block);if(bh)bh_lru_install(bh);}elsetouch_buffer(bh);returnbh;}EXPORT_SYMBOL(__find_get_block);/* * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * * __getblk_gfp() will lock up the machine if grow_dev_page's * try_to_free_buffers() attempt is failing. FIXME, perhaps? */structbuffer_head*__getblk_gfp(structblock_device*bdev,sector_tblock,unsignedsize,gfp_tgfp){structbuffer_head*bh=__find_get_block(bdev,block,size);might_sleep();if(bh==NULL)bh=__getblk_slow(bdev,block,size,gfp);returnbh;}EXPORT_SYMBOL(__getblk_gfp);/* * Do async read-ahead on a buffer.. */void__breadahead(structblock_device*bdev,sector_tblock,unsignedsize){structbuffer_head*bh=__getblk(bdev,block,size);if(likely(bh)){ll_rw_block(REQ_OP_READ,REQ_RAHEAD,1,&bh);brelse(bh);}}EXPORT_SYMBOL(__breadahead);/** * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read * @gfp: page allocation flag * * Reads a specified block, and returns buffer head that contains it. * The page cache can be allocated from non-movable area * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */structbuffer_head*__bread_gfp(structblock_device*bdev,sector_tblock,unsignedsize,gfp_tgfp){structbuffer_head*bh=__getblk_gfp(bdev,block,size,gfp);if(likely(bh)&&!buffer_uptodate(bh))bh=__bread_slow(bh);returnbh;}EXPORT_SYMBOL(__bread_gfp);/* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq * or with preempt disabled. */staticvoidinvalidate_bh_lru(void*arg){structbh_lru*b=&get_cpu_var(bh_lrus);inti;for(i=0;i<BH_LRU_SIZE;i++){brelse(b->bhs[i]);b->bhs[i]=NULL;}put_cpu_var(bh_lrus);}staticboolhas_bh_in_lru(intcpu,void*dummy){structbh_lru*b=per_cpu_ptr(&bh_lrus,cpu);inti;for(i=0;i<BH_LRU_SIZE;i++){if(b->bhs[i])return1;}return0;}voidinvalidate_bh_lrus(void){on_each_cpu_cond(has_bh_in_lru,invalidate_bh_lru,NULL,1,GFP_KERNEL);}EXPORT_SYMBOL_GPL(invalidate_bh_lrus);voidset_bh_page(structbuffer_head*bh,structpage*page,unsignedlongoffset){bh->b_page=page;BUG_ON(offset>=PAGE_SIZE);if(PageHighMem(page))/* * This catches illegal uses and preserves the offset: */bh->b_data=(char*)(0+offset);elsebh->b_data=page_address(page)+offset;}EXPORT_SYMBOL(set_bh_page);/* * Called when truncating a buffer on a page completely. 
*//* Bits that are cleared during an invalidate */#define BUFFER_FLAGS_DISCARD \ (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ 1 << BH_Delay | 1 << BH_Unwritten)staticvoiddiscard_buffer(structbuffer_head*bh){unsignedlongb_state,b_state_old;lock_buffer(bh);clear_buffer_dirty(bh);bh->b_bdev=NULL;b_state=bh->b_state;for(;;){b_state_old=cmpxchg(&bh->b_state,b_state,(b_state&~BUFFER_FLAGS_DISCARD));if(b_state_old==b_state)break;b_state=b_state_old;}unlock_buffer(bh);}/** * block_invalidatepage - invalidate part or all of a buffer-backed page * * @page: the page which is affected * @offset: start of the range to invalidate * @length: length of the range to invalidate * * block_invalidatepage() is called when all or part of the page has become * invalidated by a truncate operation. * * block_invalidatepage() does not have to release all buffers, but it must * ensure that no dirty buffer is left outside @offset and that no I/O * is underway against any of the blocks which are outside the truncation * point. Because the caller is about to free (and possibly reuse) those * blocks on-disk. */voidblock_invalidatepage(structpage*page,unsignedintoffset,unsignedintlength){structbuffer_head*head,*bh,*next;unsignedintcurr_off=0;unsignedintstop=length+offset;BUG_ON(!PageLocked(page));if(!page_has_buffers(page))gotoout;/* * Check for overflow */BUG_ON(stop>PAGE_SIZE||stop<length);head=page_buffers(page);bh=head;do{unsignedintnext_off=curr_off+bh->b_size;next=bh->b_this_page;/* * Are we still fully in range ? */if(next_off>stop)gotoout;/* * is this block fully invalidated? */if(offset<=curr_off)discard_buffer(bh);curr_off=next_off;bh=next;}while(bh!=head);/* * We release buffers only if the entire page is being invalidated. * The get_block cached value has been unconditionally invalidated, * so real IO is not possible anymore. */if(length==PAGE_SIZE)try_to_release_page(page,0);out:return;}EXPORT_SYMBOL(block_invalidatepage);/* * We attach and possibly dirty the buffers atomically wrt * __set_page_dirty_buffers() via private_lock. try_to_free_buffers * is already excluded via the page lock. */voidcreate_empty_buffers(structpage*page,unsignedlongblocksize,unsignedlongb_state){structbuffer_head*bh,*head,*tail;head=alloc_page_buffers(page,blocksize,true);bh=head;do{bh->b_state|=b_state;tail=bh;bh=bh->b_this_page;}while(bh);tail->b_this_page=head;spin_lock(&page->mapping->private_lock);if(PageUptodate(page)||PageDirty(page)){bh=head;do{if(PageDirty(page))set_buffer_dirty(bh);if(PageUptodate(page))set_buffer_uptodate(bh);bh=bh->b_this_page;}while(bh!=head);}attach_page_buffers(page,head);spin_unlock(&page->mapping->private_lock);}EXPORT_SYMBOL(create_empty_buffers);/** * clean_bdev_aliases: clean a range of buffers in block device * @bdev: Block device to clean buffers in * @block: Start of a range of blocks to clean * @len: Number of blocks to clean * * We are taking a range of blocks for data and we don't want writeback of any * buffer-cache aliases starting from return from this function and until the * moment when something will explicitly mark the buffer dirty (hopefully that * will not happen until we will free that block ;-) We don't even need to mark * it not-uptodate - nobody can expect anything from a newly allocated buffer * anyway. We used to use unmap_buffer() for such invalidation, but that was * wrong. We definitely don't want to mark the alias unmapped, for example - it * would confuse anyone who might pick it with bread() afterwards... * * Also.. Note that bforget() doesn't lock the buffer. 
So there can be * writeout I/O going on against recently-freed buffers. We don't wait on that * I/O in bforget() - it's more efficient to wait on the I/O only if we really * need to. That happens here. */voidclean_bdev_aliases(structblock_device*bdev,sector_tblock,sector_tlen){structinode*bd_inode=bdev->bd_inode;structaddress_space*bd_mapping=bd_inode->i_mapping;structpagevecpvec;pgoff_tindex=block>>(PAGE_SHIFT-bd_inode->i_blkbits);pgoff_tend;inti,count;structbuffer_head*bh;structbuffer_head*head;end=(block+len-1)>>(PAGE_SHIFT-bd_inode->i_blkbits);pagevec_init(&pvec);while(pagevec_lookup_range(&pvec,bd_mapping,&index,end)){count=pagevec_count(&pvec);for(i=0;i<count;i++){structpage*page=pvec.pages[i];if(!page_has_buffers(page))continue;/* * We use page lock instead of bd_mapping->private_lock * to pin buffers here since we can afford to sleep and * it scales better than a global spinlock lock. */lock_page(page);/* Recheck when the page is locked which pins bhs */if(!page_has_buffers(page))gotounlock_page;head=page_buffers(page);bh=head;do{if(!buffer_mapped(bh)||(bh->b_blocknr<block))gotonext;if(bh->b_blocknr>=block+len)break;clear_buffer_dirty(bh);wait_on_buffer(bh);clear_buffer_req(bh);next:bh=bh->b_this_page;}while(bh!=head);unlock_page:unlock_page(page);}pagevec_release(&pvec);cond_resched();/* End of range already reached? */if(index>end||!index)break;}}EXPORT_SYMBOL(clean_bdev_aliases);/* * Size is a power-of-two in the range 512..PAGE_SIZE, * and the case we care about most is PAGE_SIZE. * * So this *could* possibly be written with those * constraints in mind (relevant mostly if some * architecture has a slow bit-scan instruction) */staticinlineintblock_size_bits(unsignedintblocksize){returnilog2(blocksize);}staticstructbuffer_head*create_page_buffers(structpage*page,structinode*inode,unsignedintb_state){BUG_ON(!PageLocked(page));if(!page_has_buffers(page))create_empty_buffers(page,1<<READ_ONCE(inode->i_blkbits),b_state);returnpage_buffers(page);}/* * NOTE! All mapped/uptodate combinations are valid: * * Mapped Uptodate Meaning * * No No "unknown" - must do get_block() * No Yes "hole" - zero-filled * Yes No "allocated" - allocated on disk, not read in * Yes Yes "valid" - allocated and up-to-date in memory. * * "Dirty" is valid only with the last case (mapped+uptodate). *//* * While block_write_full_page is writing back the dirty buffers under * the page lock, whoever dirtied the buffers may decide to clean them * again at any time. We handle that by only looking at the buffer * state inside lock_buffer(). * * If block_write_full_page() is called for regular writeback * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a * locked buffer. This only can happen if someone has written the buffer * directly, with submit_bh(). At the address_space level PageWriteback * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */int__block_write_full_page(structinode*inode,structpage*page,get_block_t*get_block,structwriteback_control*wbc,bh_end_io_t*handler){interr;sector_tblock;sector_tlast_block;structbuffer_head*bh,*head;unsignedintblocksize,bbits;intnr_underway=0;intwrite_flags=wbc_to_write_flags(wbc);head=create_page_buffers(page,inode,(1<<BH_Dirty)|(1<<BH_Uptodate));/* * Be very careful. 
We have no exclusion from __set_page_dirty_buffers * here, and the (potentially unmapped) buffers may become dirty at * any time. If a buffer becomes dirty here after we've inspected it * then we just miss that fact, and the page stays dirty. * * Buffers outside i_size may be dirtied by __set_page_dirty_buffers; * handle that here by just cleaning them. */bh=head;blocksize=bh->b_size;bbits=block_size_bits(blocksize);block=(sector_t)page->index<<(PAGE_SHIFT-bbits);last_block=(i_size_read(inode)-1)>>bbits;/* * Get all the dirty buffers mapped to disk addresses and * handle any aliases from the underlying blockdev's mapping. */do{if(block>last_block){/* * mapped buffers outside i_size will occur, because * this page can be outside i_size when there is a * truncate in progress. *//* * The buffer was zeroed by block_write_full_page() */clear_buffer_dirty(bh);set_buffer_uptodate(bh);}elseif((!buffer_mapped(bh)||buffer_delay(bh))&&buffer_dirty(bh)){WARN_ON(bh->b_size!=blocksize);err=get_block(inode,block,bh,1);if(err)gotorecover;clear_buffer_delay(bh);if(buffer_new(bh)){/* blockdev mappings never come here */clear_buffer_new(bh);clean_bdev_bh_alias(bh);}}bh=bh->b_this_page;block++;}while(bh!=head);do{if(!buffer_mapped(bh))continue;/* * If it's a fully non-blocking write attempt and we cannot * lock the buffer then redirty the page. Note that this can * potentially cause a busy-wait loop from writeback threads * and kswapd activity, but those code paths have their own * higher-level throttling. */if(wbc->sync_mode!=WB_SYNC_NONE){lock_buffer(bh);}elseif(!trylock_buffer(bh)){redirty_page_for_writepage(wbc,page);continue;}if(test_clear_buffer_dirty(bh)){mark_buffer_async_write_endio(bh,handler);}else{unlock_buffer(bh);}}while((bh=bh->b_this_page)!=head);/* * The page and its buffers are protected by PageWriteback(), so we can * drop the bh refcounts early. */BUG_ON(PageWriteback(page));set_page_writeback(page);do{structbuffer_head*next=bh->b_this_page;if(buffer_async_write(bh)){submit_bh_wbc(REQ_OP_WRITE,write_flags,bh,inode->i_write_hint,wbc);nr_underway++;}bh=next;}while(bh!=head);unlock_page(page);err=0;done:if(nr_underway==0){/* * The page was marked dirty, but the buffers were * clean. Someone wrote them back by hand with * ll_rw_block/submit_bh. A rare case. */end_page_writeback(page);/* * The page and buffer_heads can be released at any time from * here on. */}returnerr;recover:/* * ENOSPC, or some other error. We may already have added some * blocks to the file, so we need to write these out to avoid * exposing stale data. * The page is currently locked and not marked for writeback */bh=head;/* Recovery: lock and submit the mapped buffers */do{if(buffer_mapped(bh)&&buffer_dirty(bh)&&!buffer_delay(bh)){lock_buffer(bh);mark_buffer_async_write_endio(bh,handler);}else{/* * The buffer may have been set dirty during * attachment to a dirty page. */clear_buffer_dirty(bh);}}while((bh=bh->b_this_page)!=head);SetPageError(page);BUG_ON(PageWriteback(page));mapping_set_error(page->mapping,err);set_page_writeback(page);do{structbuffer_head*next=bh->b_this_page;if(buffer_async_write(bh)){clear_buffer_dirty(bh);submit_bh_wbc(REQ_OP_WRITE,write_flags,bh,inode->i_write_hint,wbc);nr_underway++;}bh=next;}while(bh!=head);unlock_page(page);gotodone;}EXPORT_SYMBOL(__block_write_full_page);/* * If a page has any new buffers, zero them out here, and mark them uptodate * and dirty so they'll be written out (in order to prevent uninitialised * block data from leaking). And clear the new bit. 
*/voidpage_zero_new_buffers(structpage*page,unsignedfrom,unsignedto){unsignedintblock_start,block_end;structbuffer_head*head,*bh;BUG_ON(!PageLocked(page));if(!page_has_buffers(page))return;bh=head=page_buffers(page);block_start=0;do{block_end=block_start+bh->b_size;if(buffer_new(bh)){if(block_end>from&&block_start<to){if(!PageUptodate(page)){unsignedstart,size;start=max(from,block_start);size=min(to,block_end)-start;zero_user(page,start,size);set_buffer_uptodate(bh);}clear_buffer_new(bh);mark_buffer_dirty(bh);}}block_start=block_end;bh=bh->b_this_page;}while(bh!=head);}EXPORT_SYMBOL(page_zero_new_buffers);staticvoidiomap_to_bh(structinode*inode,sector_tblock,structbuffer_head*bh,structiomap*iomap){loff_toffset=block<<inode->i_blkbits;bh->b_bdev=iomap->bdev;/* * Block points to offset in file we need to map, iomap contains * the offset at which the map starts. If the map ends before the * current block, then do not map the buffer and let the caller * handle it. */BUG_ON(offset>=iomap->offset+iomap->length);switch(iomap->type){caseIOMAP_HOLE:/* * If the buffer is not up to date or beyond the current EOF, * we need to mark it as new to ensure sub-block zeroing is * executed if necessary. */if(!buffer_uptodate(bh)||(offset>=i_size_read(inode)))set_buffer_new(bh);break;caseIOMAP_DELALLOC:if(!buffer_uptodate(bh)||(offset>=i_size_read(inode)))set_buffer_new(bh);set_buffer_uptodate(bh);set_buffer_mapped(bh);set_buffer_delay(bh);break;caseIOMAP_UNWRITTEN:/* * For unwritten regions, we always need to ensure that * sub-block writes cause the regions in the block we are not * writing to are zeroed. Set the buffer as new to ensure this. */set_buffer_new(bh);set_buffer_unwritten(bh);/* FALLTHRU */caseIOMAP_MAPPED:if(offset>=i_size_read(inode))set_buffer_new(bh);bh->b_blocknr=(iomap->addr+offset-iomap->offset)>>inode->i_blkbits;set_buffer_mapped(bh);break;}}int__block_write_begin_int(structpage*page,loff_tpos,unsignedlen,get_block_t*get_block,structiomap*iomap){unsignedfrom=pos&(PAGE_SIZE-1);unsignedto=from+len;structinode*inode=page->mapping->host;unsignedblock_start,block_end;sector_tblock;interr=0;unsignedblocksize,bbits;structbuffer_head*bh,*head,*wait[2],**wait_bh=wait;BUG_ON(!PageLocked(page));BUG_ON(from>PAGE_SIZE);BUG_ON(to>PAGE_SIZE);BUG_ON(from>to);head=create_page_buffers(page,inode,0);blocksize=head->b_size;bbits=block_size_bits(blocksize);block=(sector_t)page->index<<(PAGE_SHIFT-bbits);for(bh=head,block_start=0;bh!=head||!block_start;block++,block_start=block_end,bh=bh->b_this_page){block_end=block_start+blocksize;if(block_end<=from||block_start>=to){if(PageUptodate(page)){if(!buffer_uptodate(bh))set_buffer_uptodate(bh);}continue;}if(buffer_new(bh))clear_buffer_new(bh);if(!buffer_mapped(bh)){WARN_ON(bh->b_size!=blocksize);if(get_block){err=get_block(inode,block,bh,1);if(err)break;}else{iomap_to_bh(inode,block,bh,iomap);}if(buffer_new(bh)){clean_bdev_bh_alias(bh);if(PageUptodate(page)){clear_buffer_new(bh);set_buffer_uptodate(bh);mark_buffer_dirty(bh);continue;}if(block_end>to||block_start<from)zero_user_segments(page,to,block_end,block_start,from);continue;}}if(PageUptodate(page)){if(!buffer_uptodate(bh))set_buffer_uptodate(bh);continue;}if(!buffer_uptodate(bh)&&!buffer_delay(bh)&&!buffer_unwritten(bh)&&(block_start<from||block_end>to)){ll_rw_block(REQ_OP_READ,0,1,&bh);*wait_bh++=bh;}}/* * If we issued read requests - let them complete. 
*/while(wait_bh>wait){wait_on_buffer(*--wait_bh);if(!buffer_uptodate(*wait_bh))err=-EIO;}if(unlikely(err))page_zero_new_buffers(page,from,to);returnerr;}int__block_write_begin(structpage*page,loff_tpos,unsignedlen,get_block_t*get_block){return__block_write_begin_int(page,pos,len,get_block,NULL);}EXPORT_SYMBOL(__block_write_begin);staticint__block_commit_write(structinode*inode,structpage*page,unsignedfrom,unsignedto){unsignedblock_start,block_end;intpartial=0;unsignedblocksize;structbuffer_head*bh,*head;bh=head=page_buffers(page);blocksize=bh->b_size;block_start=0;do{block_end=block_start+blocksize;if(block_end<=from||block_start>=to){if(!buffer_uptodate(bh))partial=1;}else{set_buffer_uptodate(bh);mark_buffer_dirty(bh);}clear_buffer_new(bh);block_start=block_end;bh=bh->b_this_page;}while(bh!=head);/* * If this is a partial write which happened to make all buffers * uptodate then we can optimize away a bogus readpage() for * the next read(). Here we 'discover' whether the page went * uptodate as a result of this (potentially partial) write. */if(!partial)SetPageUptodate(page);return0;}/* * block_write_begin takes care of the basic task of block allocation and * bringing partial write blocks uptodate first. * * The filesystem needs to handle block truncation upon failure. */intblock_write_begin(structaddress_space*mapping,loff_tpos,unsignedlen,unsignedflags,structpage**pagep,get_block_t*get_block){pgoff_tindex=pos>>PAGE_SHIFT;structpage*page;intstatus;page=grab_cache_page_write_begin(mapping,index,flags);if(!page)return-ENOMEM;status=__block_write_begin(page,pos,len,get_block);if(unlikely(status)){unlock_page(page);put_page(page);page=NULL;}*pagep=page;returnstatus;}EXPORT_SYMBOL(block_write_begin);intblock_write_end(structfile*file,structaddress_space*mapping,loff_tpos,unsignedlen,unsignedcopied,structpage*page,void*fsdata){structinode*inode=mapping->host;unsignedstart;start=pos&(PAGE_SIZE-1);if(unlikely(copied<len)){/* * The buffers that were written will now be uptodate, so we * don't have to worry about a readpage reading them and * overwriting a partial write. However if we have encountered * a short write and only partially written into a buffer, it * will not be marked uptodate, so a readpage might come in and * destroy our partial write. * * Do the simplest thing, and just treat any short write to a * non uptodate page as a zero-length write, and force the * caller to redo the whole thing. */if(!PageUptodate(page))copied=0;page_zero_new_buffers(page,start+copied,start+len);}flush_dcache_page(page);/* This could be a short (even 0-length) commit */__block_commit_write(inode,page,start,start+copied);returncopied;}EXPORT_SYMBOL(block_write_end);intgeneric_write_end(structfile*file,structaddress_space*mapping,loff_tpos,unsignedlen,unsignedcopied,structpage*page,void*fsdata){structinode*inode=mapping->host;loff_told_size=inode->i_size;inti_size_changed=0;copied=block_write_end(file,mapping,pos,len,copied,page,fsdata);/* * No need to use i_size_read() here, the i_size * cannot change under us because we hold i_mutex. * * But it's important to update i_size while still holding page lock: * page writeout could otherwise come in and zero beyond i_size. */if(pos+copied>inode->i_size){i_size_write(inode,pos+copied);i_size_changed=1;}unlock_page(page);put_page(page);if(old_size<pos)pagecache_isize_extended(inode,old_size,pos);/* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. 
Second, it forces lock * ordering of page lock and transaction start for journaling * filesystems. */if(i_size_changed)mark_inode_dirty(inode);returncopied;}EXPORT_SYMBOL(generic_write_end);/* * block_is_partially_uptodate checks whether buffers within a page are * uptodate or not. * * Returns true if all buffers which correspond to a file portion * we want to read are uptodate. */intblock_is_partially_uptodate(structpage*page,unsignedlongfrom,unsignedlongcount){unsignedblock_start,block_end,blocksize;unsignedto;structbuffer_head*bh,*head;intret=1;if(!page_has_buffers(page))return0;head=page_buffers(page);blocksize=head->b_size;to=min_t(unsigned,PAGE_SIZE-from,count);to=from+to;if(from<blocksize&&to>PAGE_SIZE-blocksize)return0;bh=head;block_start=0;do{block_end=block_start+blocksize;if(block_end>from&&block_start<to){if(!buffer_uptodate(bh)){ret=0;break;}if(block_end>=to)break;}block_start=block_end;bh=bh->b_this_page;}while(bh!=head);returnret;}EXPORT_SYMBOL(block_is_partially_uptodate);/* * Generic "read page" function for block devices that have the normal * get_block functionality. This is most of the block device filesystems. * Reads the page asynchronously --- the unlock_buffer() and * set/clear_buffer_uptodate() functions propagate buffer state into the * page struct once IO has completed. */intblock_read_full_page(structpage*page,get_block_t*get_block){structinode*inode=page->mapping->host;sector_tiblock,lblock;structbuffer_head*bh,*head,*arr[MAX_BUF_PER_PAGE];unsignedintblocksize,bbits;intnr,i;intfully_mapped=1;head=create_page_buffers(page,inode,0);blocksize=head->b_size;bbits=block_size_bits(blocksize);iblock=(sector_t)page->index<<(PAGE_SHIFT-bbits);lblock=(i_size_read(inode)+blocksize-1)>>bbits;bh=head;nr=0;i=0;do{if(buffer_uptodate(bh))continue;if(!buffer_mapped(bh)){interr=0;fully_mapped=0;if(iblock<lblock){WARN_ON(bh->b_size!=blocksize);err=get_block(inode,iblock,bh,0);if(err)SetPageError(page);}if(!buffer_mapped(bh)){zero_user(page,i*blocksize,blocksize);if(!err)set_buffer_uptodate(bh);continue;}/* * get_block() might have updated the buffer * synchronously */if(buffer_uptodate(bh))continue;}arr[nr++]=bh;}while(i++,iblock++,(bh=bh->b_this_page)!=head);if(fully_mapped)SetPageMappedToDisk(page);if(!nr){/* * All buffers are uptodate - we can set the page uptodate * as well. But not if get_block() returned an error. */if(!PageError(page))SetPageUptodate(page);unlock_page(page);return0;}/* Stage two: lock the buffers */for(i=0;i<nr;i++){bh=arr[i];lock_buffer(bh);mark_buffer_async_read(bh);}/* * Stage 3: start the IO. Check for uptodateness * inside the buffer lock in case another process reading * the underlying blockdev brought it uptodate (the sct fix). */for(i=0;i<nr;i++){bh=arr[i];if(buffer_uptodate(bh))end_buffer_async_read(bh,1);elsesubmit_bh(REQ_OP_READ,0,bh);}return0;}EXPORT_SYMBOL(block_read_full_page);/* utility function for filesystems that need to do work on expanding * truncates. Uses filesystem pagecache writes to allow the filesystem to * deal with the hole. 
*/intgeneric_cont_expand_simple(structinode*inode,loff_tsize){structaddress_space*mapping=inode->i_mapping;structpage*page;void*fsdata;interr;err=inode_newsize_ok(inode,size);if(err)gotoout;err=pagecache_write_begin(NULL,mapping,size,0,AOP_FLAG_CONT_EXPAND,&page,&fsdata);if(err)gotoout;err=pagecache_write_end(NULL,mapping,size,0,0,page,fsdata);BUG_ON(err>0);out:returnerr;}EXPORT_SYMBOL(generic_cont_expand_simple);staticintcont_expand_zero(structfile*file,structaddress_space*mapping,loff_tpos,loff_t*bytes){structinode*inode=mapping->host;unsignedintblocksize=i_blocksize(inode);structpage*page;void*fsdata;pgoff_tindex,curidx;loff_tcurpos;unsignedzerofrom,offset,len;interr=0;index=pos>>PAGE_SHIFT;offset=pos&~PAGE_MASK;while(index>(curidx=(curpos=*bytes)>>PAGE_SHIFT)){zerofrom=curpos&~PAGE_MASK;if(zerofrom&(blocksize-1)){*bytes|=(blocksize-1);(*bytes)++;}len=PAGE_SIZE-zerofrom;err=pagecache_write_begin(file,mapping,curpos,len,0,&page,&fsdata);if(err)gotoout;zero_user(page,zerofrom,len);err=pagecache_write_end(file,mapping,curpos,len,len,page,fsdata);if(err<0)gotoout;BUG_ON(err!=len);err=0;balance_dirty_pages_ratelimited(mapping);if(unlikely(fatal_signal_pending(current))){err=-EINTR;gotoout;}}/* page covers the boundary, find the boundary offset */if(index==curidx){zerofrom=curpos&~PAGE_MASK;/* if we will expand the thing last block will be filled */if(offset<=zerofrom){gotoout;}if(zerofrom&(blocksize-1)){*bytes|=(blocksize-1);(*bytes)++;}len=offset-zerofrom;err=pagecache_write_begin(file,mapping,curpos,len,0,&page,&fsdata);if(err)gotoout;zero_user(page,zerofrom,len);err=pagecache_write_end(file,mapping,curpos,len,len,page,fsdata);if(err<0)gotoout;BUG_ON(err!=len);err=0;}out:returnerr;}/* * For moronic filesystems that do not allow holes in file. * We may have to extend the file. */intcont_write_begin(structfile*file,structaddress_space*mapping,loff_tpos,unsignedlen,unsignedflags,structpage**pagep,void**fsdata,get_block_t*get_block,loff_t*bytes){structinode*inode=mapping->host;unsignedintblocksize=i_blocksize(inode);unsignedintzerofrom;interr;err=cont_expand_zero(file,mapping,pos,bytes);if(err)returnerr;zerofrom=*bytes&~PAGE_MASK;if(pos+len>*bytes&&zerofrom&(blocksize-1)){*bytes|=(blocksize-1);(*bytes)++;}returnblock_write_begin(mapping,pos,len,flags,pagep,get_block);}EXPORT_SYMBOL(cont_write_begin);intblock_commit_write(structpage*page,unsignedfrom,unsignedto){structinode*inode=page->mapping->host;__block_commit_write(inode,page,from,to);return0;}EXPORT_SYMBOL(block_commit_write);/* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must * be careful to check for EOF conditions here. We set the page up correctly * for a written page which means we get ENOSPC checking when writing into * holes and correct delalloc and unwritten extent mapping on filesystems that * support these features. * * We are not allowed to take the i_mutex here so we have to play games to * protect against truncate races as the page could now be beyond EOF. Because * truncate writes the inode size before removing pages, once we have the * page lock we can determine safely if the page is beyond EOF. If it is not * beyond EOF, then the page is guaranteed safe against truncation until we * unlock the page. * * Direct callers of this function should protect against filesystem freezing * using sb_start_pagefault() - sb_end_pagefault() functions. 
/*
 * block_page_mkwrite() is not allowed to change the file size as it gets
 * called from a page fault handler when a page is first dirtied. Hence we must
 * be careful to check for EOF conditions here. We set the page up correctly
 * for a written page which means we get ENOSPC checking when writing into
 * holes and correct delalloc and unwritten extent mapping on filesystems that
 * support these features.
 *
 * We are not allowed to take the i_mutex here so we have to play games to
 * protect against truncate races as the page could now be beyond EOF.  Because
 * truncate writes the inode size before removing pages, once we have the
 * page lock we can determine safely if the page is beyond EOF. If it is not
 * beyond EOF, then the page is guaranteed safe against truncation until we
 * unlock the page.
 *
 * Direct callers of this function should protect against filesystem freezing
 * using sb_start_pagefault() - sb_end_pagefault() functions.
 */
int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf,
			 get_block_t get_block)
{
	struct page *page = vmf->page;
	struct inode *inode = file_inode(vma->vm_file);
	unsigned long end;
	loff_t size;
	int ret;

	lock_page(page);
	size = i_size_read(inode);
	if ((page->mapping != inode->i_mapping) ||
	    (page_offset(page) > size)) {
		/* We overload EFAULT to mean page got truncated */
		ret = -EFAULT;
		goto out_unlock;
	}

	/* page is wholly or partially inside EOF */
	if (((page->index + 1) << PAGE_SHIFT) > size)
		end = size & ~PAGE_MASK;
	else
		end = PAGE_SIZE;

	ret = __block_write_begin(page, 0, end, get_block);
	if (!ret)
		ret = block_commit_write(page, 0, end);

	if (unlikely(ret < 0))
		goto out_unlock;
	set_page_dirty(page);
	wait_for_stable_page(page);
	return 0;
out_unlock:
	unlock_page(page);
	return ret;
}
EXPORT_SYMBOL(block_page_mkwrite);
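/*
 * Illustrative sketch (not part of this file): as the comment above notes,
 * direct callers are expected to bracket block_page_mkwrite() with the
 * filesystem freeze protection themselves.  A minimal ->page_mkwrite handler
 * might look like this; myfs_get_block is hypothetical and the error code is
 * converted with block_page_mkwrite_return() from buffer_head.h.  The block
 * is #if 0'd out.
 */
#if 0
static int myfs_page_mkwrite(struct vm_fault *vmf)
{
	struct super_block *sb = file_inode(vmf->vma->vm_file)->i_sb;
	int err;

	sb_start_pagefault(sb);
	err = block_page_mkwrite(vmf->vma, vmf, myfs_get_block);
	sb_end_pagefault(sb);
	return block_page_mkwrite_return(err);
}
#endif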
/*
 * nobh_write_begin()'s prereads are special: the buffer_heads are freed
 * immediately, while under the page lock.  So it needs a special end_io
 * handler which does not touch the bh after unlocking it.
 */
static void end_buffer_read_nobh(struct buffer_head *bh, int uptodate)
{
	__end_buffer_read_notouch(bh, uptodate);
}

/*
 * Attach the singly-linked list of buffers created by nobh_write_begin, to
 * the page (converting it to circular linked list and taking care of page
 * dirty races).
 */
static void attach_nobh_buffers(struct page *page, struct buffer_head *head)
{
	struct buffer_head *bh;

	BUG_ON(!PageLocked(page));

	spin_lock(&page->mapping->private_lock);
	bh = head;
	do {
		if (PageDirty(page))
			set_buffer_dirty(bh);
		if (!bh->b_this_page)
			bh->b_this_page = head;
		bh = bh->b_this_page;
	} while (bh != head);
	attach_page_buffers(page, head);
	spin_unlock(&page->mapping->private_lock);
}

/*
 * On entry, the page is fully not uptodate.
 * On exit the page is fully uptodate in the areas outside (from,to)
 * The filesystem needs to handle block truncation upon failure.
 */
int nobh_write_begin(struct address_space *mapping,
			loff_t pos, unsigned len, unsigned flags,
			struct page **pagep, void **fsdata,
			get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	const unsigned blkbits = inode->i_blkbits;
	const unsigned blocksize = 1 << blkbits;
	struct buffer_head *head, *bh;
	struct page *page;
	pgoff_t index;
	unsigned from, to;
	unsigned block_in_page;
	unsigned block_start, block_end;
	sector_t block_in_file;
	int nr_reads = 0;
	int ret = 0;
	int is_mapped_to_disk = 1;

	index = pos >> PAGE_SHIFT;
	from = pos & (PAGE_SIZE - 1);
	to = from + len;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;
	*fsdata = NULL;

	if (page_has_buffers(page)) {
		ret = __block_write_begin(page, pos, len, get_block);
		if (unlikely(ret))
			goto out_release;
		return ret;
	}

	if (PageMappedToDisk(page))
		return 0;

	/*
	 * Allocate buffers so that we can keep track of state, and potentially
	 * attach them to the page if an error occurs. In the common case of
	 * no error, they will just be freed again without ever being attached
	 * to the page (which is all OK, because we're under the page lock).
	 *
	 * Be careful: the buffer linked list is a NULL terminated one, rather
	 * than the circular one we're used to.
	 */
	head = alloc_page_buffers(page, blocksize, false);
	if (!head) {
		ret = -ENOMEM;
		goto out_release;
	}

	block_in_file = (sector_t)page->index << (PAGE_SHIFT - blkbits);

	/*
	 * We loop across all blocks in the page, whether or not they are
	 * part of the affected region.  This is so we can discover if the
	 * page is fully mapped-to-disk.
	 */
	for (block_start = 0, block_in_page = 0, bh = head;
		  block_start < PAGE_SIZE;
		  block_in_page++, block_start += blocksize,
		  bh = bh->b_this_page) {
		int create;

		block_end = block_start + blocksize;
		bh->b_state = 0;
		create = 1;
		if (block_start >= to)
			create = 0;
		ret = get_block(inode, block_in_file + block_in_page,
					bh, create);
		if (ret)
			goto failed;
		if (!buffer_mapped(bh))
			is_mapped_to_disk = 0;
		if (buffer_new(bh))
			clean_bdev_bh_alias(bh);
		if (PageUptodate(page)) {
			set_buffer_uptodate(bh);
			continue;
		}
		if (buffer_new(bh) || !buffer_mapped(bh)) {
			zero_user_segments(page, block_start, from,
							to, block_end);
			continue;
		}
		if (buffer_uptodate(bh))
			continue;	/* reiserfs does this */
		if (block_start < from || block_end > to) {
			lock_buffer(bh);
			bh->b_end_io = end_buffer_read_nobh;
			submit_bh(REQ_OP_READ, 0, bh);
			nr_reads++;
		}
	}

	if (nr_reads) {
		/*
		 * The page is locked, so these buffers are protected from
		 * any VM or truncate activity.  Hence we don't need to care
		 * for the buffer_head refcounts.
		 */
		for (bh = head; bh; bh = bh->b_this_page) {
			wait_on_buffer(bh);
			if (!buffer_uptodate(bh))
				ret = -EIO;
		}
		if (ret)
			goto failed;
	}

	if (is_mapped_to_disk)
		SetPageMappedToDisk(page);

	*fsdata = head; /* to be released by nobh_write_end */

	return 0;

failed:
	BUG_ON(!ret);
	/*
	 * Error recovery is a bit difficult. We need to zero out blocks that
	 * were newly allocated, and dirty them to ensure they get written out.
	 * Buffers need to be attached to the page at this point, otherwise
	 * the handling of potential IO errors during writeout would be hard
	 * (could try doing synchronous writeout, but what if that fails too?)
	 */
	attach_nobh_buffers(page, head);
	page_zero_new_buffers(page, from, to);

out_release:
	unlock_page(page);
	put_page(page);
	*pagep = NULL;

	return ret;
}
EXPORT_SYMBOL(nobh_write_begin);

int nobh_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *head = fsdata;
	struct buffer_head *bh;

	BUG_ON(fsdata != NULL && page_has_buffers(page));

	if (unlikely(copied < len) && head)
		attach_nobh_buffers(page, head);
	if (page_has_buffers(page))
		return generic_write_end(file, mapping, pos, len,
					copied, page, fsdata);

	SetPageUptodate(page);
	set_page_dirty(page);
	if (pos + copied > inode->i_size) {
		i_size_write(inode, pos + copied);
		mark_inode_dirty(inode);
	}

	unlock_page(page);
	put_page(page);

	while (head) {
		bh = head;
		head = head->b_this_page;
		free_buffer_head(bh);
	}

	return copied;
}
EXPORT_SYMBOL(nobh_write_end);
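/*
 * Illustrative sketch (not part of this file): the nobh_* helpers are meant
 * to be used together as a filesystem's write path when it wants to avoid
 * leaving buffer_heads attached to data pages where possible.
 * myfs_get_block is hypothetical; nobh_write_end() would be used directly as
 * the matching ->write_end.  The block is #if 0'd out.
 */
#if 0
static int myfs_nobh_write_begin(struct file *file,
		struct address_space *mapping, loff_t pos, unsigned len,
		unsigned flags, struct page **pagep, void **fsdata)
{
	return nobh_write_begin(mapping, pos, len, flags, pagep, fsdata,
				myfs_get_block);
}
#endif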
/*
 * nobh_writepage() - based on block_write_full_page() except
 * that it tries to operate without attaching bufferheads to
 * the page.
 */
int nobh_writepage(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_SHIFT;
	unsigned offset;
	int ret;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		goto out;

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_SIZE - 1);
	if (page->index >= end_index + 1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
#if 0
		/* Not really sure about this  - do we need this ? */
		if (page->mapping->a_ops->invalidatepage)
			page->mapping->a_ops->invalidatepage(page, offset);
#endif
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_SIZE);
out:
	ret = mpage_writepage(page, get_block, wbc);
	if (ret == -EAGAIN)
		ret = __block_write_full_page(inode, page, get_block, wbc,
					      end_buffer_async_write);
	return ret;
}
EXPORT_SYMBOL(nobh_writepage);

int nobh_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE - 1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head map_bh;
	int err;

	blocksize = i_blocksize(inode);
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (page_has_buffers(page)) {
has_buffers:
		unlock_page(page);
		put_page(page);
		return block_truncate_page(mapping, from, get_block);
	}

	/* Find the buffer that contains "offset" */
	pos = blocksize;
	while (offset >= pos) {
		iblock++;
		pos += blocksize;
	}

	map_bh.b_size = blocksize;
	map_bh.b_state = 0;
	err = get_block(inode, iblock, &map_bh, 0);
	if (err)
		goto unlock;
	/* unmapped? It's a hole - nothing to do */
	if (!buffer_mapped(&map_bh))
		goto unlock;

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (!PageUptodate(page)) {
		err = mapping->a_ops->readpage(NULL, page);
		if (err) {
			put_page(page);
			goto out;
		}
		lock_page(page);
		if (!PageUptodate(page)) {
			err = -EIO;
			goto unlock;
		}
		if (page_has_buffers(page))
			goto has_buffers;
	}
	zero_user(page, offset, length);
	set_page_dirty(page);
	err = 0;

unlock:
	unlock_page(page);
	put_page(page);
out:
	return err;
}
EXPORT_SYMBOL(nobh_truncate_page);

int block_truncate_page(struct address_space *mapping,
			loff_t from, get_block_t *get_block)
{
	pgoff_t index = from >> PAGE_SHIFT;
	unsigned offset = from & (PAGE_SIZE - 1);
	unsigned blocksize;
	sector_t iblock;
	unsigned length, pos;
	struct inode *inode = mapping->host;
	struct page *page;
	struct buffer_head *bh;
	int err;

	blocksize = i_blocksize(inode);
	length = offset & (blocksize - 1);

	/* Block boundary? Nothing to do */
	if (!length)
		return 0;

	length = blocksize - length;
	iblock = (sector_t)index << (PAGE_SHIFT - inode->i_blkbits);

	page = grab_cache_page(mapping, index);
	err = -ENOMEM;
	if (!page)
		goto out;

	if (!page_has_buffers(page))
		create_empty_buffers(page, blocksize, 0);

	/* Find the buffer that contains "offset" */
	bh = page_buffers(page);
	pos = blocksize;
	while (offset >= pos) {
		bh = bh->b_this_page;
		iblock++;
		pos += blocksize;
	}

	err = 0;
	if (!buffer_mapped(bh)) {
		WARN_ON(bh->b_size != blocksize);
		err = get_block(inode, iblock, bh, 0);
		if (err)
			goto unlock;
		/* unmapped? It's a hole - nothing to do */
		if (!buffer_mapped(bh))
			goto unlock;
	}

	/* Ok, it's mapped. Make sure it's up-to-date */
	if (PageUptodate(page))
		set_buffer_uptodate(bh);

	if (!buffer_uptodate(bh) && !buffer_delay(bh) && !buffer_unwritten(bh)) {
		err = -EIO;
		ll_rw_block(REQ_OP_READ, 0, 1, &bh);
		wait_on_buffer(bh);
		/* Uhhuh. Read error. Complain and punt. */
		if (!buffer_uptodate(bh))
			goto unlock;
	}

	zero_user(page, offset, length);
	mark_buffer_dirty(bh);
	err = 0;

unlock:
	unlock_page(page);
	put_page(page);
out:
	return err;
}
EXPORT_SYMBOL(block_truncate_page);
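/*
 * Illustrative sketch (not part of this file): filesystems usually call
 * block_truncate_page() (or nobh_truncate_page()) when shrinking a file, so
 * that the tail of the last remaining block is zeroed before the new size is
 * committed and blocks beyond it are released.  myfs_get_block and
 * myfs_release_blocks are hypothetical; the block is #if 0'd out.
 */
#if 0
static int myfs_shrink(struct inode *inode, loff_t newsize)
{
	int err;

	err = block_truncate_page(inode->i_mapping, newsize, myfs_get_block);
	if (err)
		return err;
	truncate_setsize(inode, newsize);
	myfs_release_blocks(inode, newsize);	/* hypothetical */
	return 0;
}
#endif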
/*
 * The generic ->writepage function for buffer-backed address_spaces
 */
int block_write_full_page(struct page *page, get_block_t *get_block,
			struct writeback_control *wbc)
{
	struct inode * const inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	const pgoff_t end_index = i_size >> PAGE_SHIFT;
	unsigned offset;

	/* Is the page fully inside i_size? */
	if (page->index < end_index)
		return __block_write_full_page(inode, page, get_block, wbc,
					       end_buffer_async_write);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_SIZE - 1);
	if (page->index >= end_index + 1 || !offset) {
		/*
		 * The page may have dirty, unmapped buffers.  For example,
		 * they may have been added in ext3_writepage().  Make them
		 * freeable here, so the page does not leak.
		 */
		do_invalidatepage(page, 0, PAGE_SIZE);
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size.  It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped.  "A file is mapped
	 * in multiples of the page size.  For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_SIZE);
	return __block_write_full_page(inode, page, get_block, wbc,
					end_buffer_async_write);
}
EXPORT_SYMBOL(block_write_full_page);

sector_t generic_block_bmap(struct address_space *mapping, sector_t block,
			    get_block_t *get_block)
{
	struct inode *inode = mapping->host;
	struct buffer_head tmp = {
		.b_size = i_blocksize(inode),
	};

	get_block(inode, block, &tmp, 0);
	return tmp.b_blocknr;
}
EXPORT_SYMBOL(generic_block_bmap);
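/*
 * Illustrative sketch (not part of this file): with a get_block callback in
 * hand, a filesystem's ->writepage and ->bmap methods typically reduce to
 * thin wrappers around the helpers above.  myfs_get_block is hypothetical;
 * the block is #if 0'd out.
 */
#if 0
static int myfs_writepage(struct page *page, struct writeback_control *wbc)
{
	return block_write_full_page(page, myfs_get_block, wbc);
}

static sector_t myfs_bmap(struct address_space *mapping, sector_t block)
{
	return generic_block_bmap(mapping, block, myfs_get_block);
}
#endif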
static void end_bio_bh_io_sync(struct bio *bio)
{
	struct buffer_head *bh = bio->bi_private;

	if (unlikely(bio_flagged(bio, BIO_QUIET)))
		set_bit(BH_Quiet, &bh->b_state);

	bh->b_end_io(bh, !bio->bi_status);
	bio_put(bio);
}

/*
 * This allows us to do IO even on the odd last sectors
 * of a device, even if the block size is some multiple
 * of the physical sector size.
 *
 * We'll just truncate the bio to the size of the device,
 * and clear the end of the buffer head manually.
 *
 * Truly out-of-range accesses will turn into actual IO
 * errors, this only handles the "we need to be able to
 * do IO at the final sector" case.
 */
void guard_bio_eod(int op, struct bio *bio)
{
	sector_t maxsector;
	struct bio_vec *bvec = bio_last_bvec_all(bio);
	unsigned truncated_bytes;
	struct hd_struct *part;

	rcu_read_lock();
	part = __disk_get_part(bio->bi_disk, bio->bi_partno);
	if (part)
		maxsector = part_nr_sects_read(part);
	else
		maxsector = get_capacity(bio->bi_disk);
	rcu_read_unlock();

	if (!maxsector)
		return;

	/*
	 * If the *whole* IO is past the end of the device,
	 * let it through, and the IO layer will turn it into
	 * an EIO.
	 */
	if (unlikely(bio->bi_iter.bi_sector >= maxsector))
		return;

	maxsector -= bio->bi_iter.bi_sector;
	if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
		return;

	/* Uhhuh. We've got a bio that straddles the device size! */
	truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9);

	/* Truncate the bio.. */
	bio->bi_iter.bi_size -= truncated_bytes;
	bvec->bv_len -= truncated_bytes;

	/* ..and clear the end of the buffer for reads */
	if (op == REQ_OP_READ) {
		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
				truncated_bytes);
	}
}

static int submit_bh_wbc(int op, int op_flags, struct buffer_head *bh,
			 enum rw_hint write_hint,
			 struct writeback_control *wbc)
{
	struct bio *bio;

	BUG_ON(!buffer_locked(bh));
	BUG_ON(!buffer_mapped(bh));
	BUG_ON(!bh->b_end_io);
	BUG_ON(buffer_delay(bh));
	BUG_ON(buffer_unwritten(bh));

	/*
	 * Only clear out a write error when rewriting
	 */
	if (test_set_buffer_req(bh) && (op == REQ_OP_WRITE))
		clear_buffer_write_io_error(bh);

	/*
	 * from here on down, it's all bio -- do the initial mapping,
	 * submit_bio -> generic_make_request may further map this bio around
	 */
	bio = bio_alloc(GFP_NOIO, 1);

	if (wbc) {
		wbc_init_bio(wbc, bio);
		wbc_account_io(wbc, bh->b_page, bh->b_size);
	}

	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
	bio_set_dev(bio, bh->b_bdev);
	bio->bi_write_hint = write_hint;

	bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
	BUG_ON(bio->bi_iter.bi_size != bh->b_size);

	bio->bi_end_io = end_bio_bh_io_sync;
	bio->bi_private = bh;

	/* Take care of bh's that straddle the end of the device */
	guard_bio_eod(op, bio);

	if (buffer_meta(bh))
		op_flags |= REQ_META;
	if (buffer_prio(bh))
		op_flags |= REQ_PRIO;
	bio_set_op_attrs(bio, op, op_flags);

	submit_bio(bio);
	return 0;
}

int submit_bh(int op, int op_flags, struct buffer_head *bh)
{
	return submit_bh_wbc(op, op_flags, bh, 0, NULL);
}
EXPORT_SYMBOL(submit_bh);

/**
 * ll_rw_block: low-level access to block devices (DEPRECATED)
 * @op: whether to %READ or %WRITE
 * @op_flags: req_flag_bits
 * @nr: number of &struct buffer_heads in the array
 * @bhs: array of pointers to &struct buffer_head
 *
 * ll_rw_block() takes an array of pointers to &struct buffer_heads, and
 * requests an I/O operation on them, either a %REQ_OP_READ or a %REQ_OP_WRITE.
 * @op_flags contains flags modifying the detailed I/O behavior, most notably
 * %REQ_RAHEAD.
 *
 * This function drops any buffer that it cannot get a lock on (with the
 * BH_Lock state bit), any buffer that appears to be clean when doing a write
 * request, and any buffer that appears to be up-to-date when doing a read
 * request.  Further it marks as clean buffers that are processed for
 * writing (the buffer cache won't assume that they are actually clean
 * until the buffer gets unlocked).
 *
 * ll_rw_block sets b_end_io to a simple completion handler that marks
 * the buffer up-to-date (if appropriate), unlocks the buffer and wakes
 * any waiters.
 *
 * All of the buffers must be for the same device, and must also be a
 * multiple of the current approved size for the device.
 */
void ll_rw_block(int op, int op_flags, int nr, struct buffer_head *bhs[])
{
	int i;

	for (i = 0; i < nr; i++) {
		struct buffer_head *bh = bhs[i];

		if (!trylock_buffer(bh))
			continue;
		if (op == WRITE) {
			if (test_clear_buffer_dirty(bh)) {
				bh->b_end_io = end_buffer_write_sync;
				get_bh(bh);
				submit_bh(op, op_flags, bh);
				continue;
			}
		} else {
			if (!buffer_uptodate(bh)) {
				bh->b_end_io = end_buffer_read_sync;
				get_bh(bh);
				submit_bh(op, op_flags, bh);
				continue;
			}
		}
		unlock_buffer(bh);
	}
}
EXPORT_SYMBOL(ll_rw_block);

void write_dirty_buffer(struct buffer_head *bh, int op_flags)
{
	lock_buffer(bh);
	if (!test_clear_buffer_dirty(bh)) {
		unlock_buffer(bh);
		return;
	}
	bh->b_end_io = end_buffer_write_sync;
	get_bh(bh);
	submit_bh(REQ_OP_WRITE, op_flags, bh);
}
EXPORT_SYMBOL(write_dirty_buffer);
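/*
 * Illustrative sketch (not part of this file): because ll_rw_block() silently
 * skips buffers it cannot lock, callers that need the data must wait on and
 * re-check every buffer afterwards.  "bhs" and "nr" are assumed to have been
 * filled in by the caller, e.g. via sb_getblk(); the block is #if 0'd out.
 */
#if 0
static int myfs_read_buffers(struct buffer_head *bhs[], int nr)
{
	int i;

	ll_rw_block(REQ_OP_READ, 0, nr, bhs);
	for (i = 0; i < nr; i++) {
		wait_on_buffer(bhs[i]);
		if (!buffer_uptodate(bhs[i]))
			return -EIO;
	}
	return 0;
}
#endif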
/*
 * For a data-integrity writeout, we need to wait upon any in-progress I/O
 * and then start new I/O and then wait upon it.  The caller must have a ref on
 * the buffer_head.
 */
int __sync_dirty_buffer(struct buffer_head *bh, int op_flags)
{
	int ret = 0;

	WARN_ON(atomic_read(&bh->b_count) < 1);
	lock_buffer(bh);
	if (test_clear_buffer_dirty(bh)) {
		get_bh(bh);
		bh->b_end_io = end_buffer_write_sync;
		ret = submit_bh(REQ_OP_WRITE, op_flags, bh);
		wait_on_buffer(bh);
		if (!ret && !buffer_uptodate(bh))
			ret = -EIO;
	} else {
		unlock_buffer(bh);
	}
	return ret;
}
EXPORT_SYMBOL(__sync_dirty_buffer);

int sync_dirty_buffer(struct buffer_head *bh)
{
	return __sync_dirty_buffer(bh, REQ_SYNC);
}
EXPORT_SYMBOL(sync_dirty_buffer);

/*
 * try_to_free_buffers() checks if all the buffers on this particular page
 * are unused, and releases them if so.
 *
 * Exclusion against try_to_free_buffers may be obtained by either
 * locking the page or by holding its mapping's private_lock.
 *
 * If the page is dirty but all the buffers are clean then we need to
 * be sure to mark the page clean as well.  This is because the page
 * may be against a block device, and a later reattachment of buffers
 * to a dirty page will set *all* buffers dirty.  Which would corrupt
 * filesystem data on the same device.
 *
 * The same applies to regular filesystem pages: if all the buffers are
 * clean then we set the page clean and proceed.  To do that, we require
 * total exclusion from __set_page_dirty_buffers().  That is obtained with
 * private_lock.
 *
 * try_to_free_buffers() is non-blocking.
 */
static inline int buffer_busy(struct buffer_head *bh)
{
	return atomic_read(&bh->b_count) |
		(bh->b_state & ((1 << BH_Dirty) | (1 << BH_Lock)));
}

static int
drop_buffers(struct page *page, struct buffer_head **buffers_to_free)
{
	struct buffer_head *head = page_buffers(page);
	struct buffer_head *bh;

	bh = head;
	do {
		if (buffer_busy(bh))
			goto failed;
		bh = bh->b_this_page;
	} while (bh != head);

	do {
		struct buffer_head *next = bh->b_this_page;

		if (bh->b_assoc_map)
			__remove_assoc_queue(bh);
		bh = next;
	} while (bh != head);
	*buffers_to_free = head;
	__clear_page_buffers(page);
	return 1;
failed:
	return 0;
}

int try_to_free_buffers(struct page *page)
{
	struct address_space * const mapping = page->mapping;
	struct buffer_head *buffers_to_free = NULL;
	int ret = 0;

	BUG_ON(!PageLocked(page));
	if (PageWriteback(page))
		return 0;

	if (mapping == NULL) {		/* can this still happen? */
		ret = drop_buffers(page, &buffers_to_free);
		goto out;
	}

	spin_lock(&mapping->private_lock);
	ret = drop_buffers(page, &buffers_to_free);

	/*
	 * If the filesystem writes its buffers by hand (eg ext3)
	 * then we can have clean buffers against a dirty page.  We
	 * clean the page here; otherwise the VM will never notice
	 * that the filesystem did any IO at all.
	 *
	 * Also, during truncate, discard_buffer will have marked all
	 * the page's buffers clean.  We discover that here and clean
	 * the page also.
	 *
	 * private_lock must be held over this entire operation in order
	 * to synchronise against __set_page_dirty_buffers and prevent the
	 * dirty bit from being lost.
	 */
	if (ret)
		cancel_dirty_page(page);
	spin_unlock(&mapping->private_lock);
out:
	if (buffers_to_free) {
		struct buffer_head *bh = buffers_to_free;

		do {
			struct buffer_head *next = bh->b_this_page;
			free_buffer_head(bh);
			bh = next;
		} while (bh != buffers_to_free);
	}
	return ret;
}
EXPORT_SYMBOL(try_to_free_buffers);
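/*
 * Illustrative sketch (not part of this file): a typical data-integrity use
 * of sync_dirty_buffer() is flushing an on-disk superblock or similar
 * metadata buffer that the filesystem already holds a reference to (e.g. one
 * obtained with sb_bread() at mount time).  MYFS_SB() and the s_sbh field
 * are hypothetical; the block is #if 0'd out.
 */
#if 0
static int myfs_commit_super(struct super_block *sb)
{
	struct buffer_head *bh = MYFS_SB(sb)->s_sbh;	/* hypothetical */

	mark_buffer_dirty(bh);
	return sync_dirty_buffer(bh);	/* waits; returns -EIO on failure */
}
#endif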
/*
 * There are no bdflush tunables left.  But distributions are
 * still running obsolete flush daemons, so we terminate them here.
 *
 * Use of bdflush() is deprecated and will be removed in a future kernel.
 * The `flush-X' kernel threads fully replace bdflush daemons and this call.
 */
SYSCALL_DEFINE2(bdflush, int, func, long, data)
{
	static int msg_count;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (msg_count < 5) {
		msg_count++;
		printk(KERN_INFO
			"warning: process `%s' used the obsolete bdflush"
			" system call\n", current->comm);
		printk(KERN_INFO "Fix your initscripts?\n");
	}

	if (func == 1)
		do_exit(0);
	return 0;
}

/*
 * Buffer-head allocation
 */
static struct kmem_cache *bh_cachep __read_mostly;

/*
 * Once the number of bh's in the machine exceeds this level, we start
 * stripping them in writeback.
 */
static unsigned long max_buffer_heads;

int buffer_heads_over_limit;

struct bh_accounting {
	int nr;			/* Number of live bh's */
	int ratelimit;		/* Limit cacheline bouncing */
};

static DEFINE_PER_CPU(struct bh_accounting, bh_accounting) = {0, 0};

static void recalc_bh_state(void)
{
	int i;
	int tot = 0;

	if (__this_cpu_inc_return(bh_accounting.ratelimit) - 1 < 4096)
		return;
	__this_cpu_write(bh_accounting.ratelimit, 0);
	for_each_online_cpu(i)
		tot += per_cpu(bh_accounting, i).nr;
	buffer_heads_over_limit = (tot > max_buffer_heads);
}

struct buffer_head *alloc_buffer_head(gfp_t gfp_flags)
{
	struct buffer_head *ret = kmem_cache_zalloc(bh_cachep, gfp_flags);

	if (ret) {
		INIT_LIST_HEAD(&ret->b_assoc_buffers);
		preempt_disable();
		__this_cpu_inc(bh_accounting.nr);
		recalc_bh_state();
		preempt_enable();
	}
	return ret;
}
EXPORT_SYMBOL(alloc_buffer_head);

void free_buffer_head(struct buffer_head *bh)
{
	BUG_ON(!list_empty(&bh->b_assoc_buffers));
	kmem_cache_free(bh_cachep, bh);
	preempt_disable();
	__this_cpu_dec(bh_accounting.nr);
	recalc_bh_state();
	preempt_enable();
}
EXPORT_SYMBOL(free_buffer_head);

static int buffer_exit_cpu_dead(unsigned int cpu)
{
	int i;
	struct bh_lru *b = &per_cpu(bh_lrus, cpu);

	for (i = 0; i < BH_LRU_SIZE; i++) {
		brelse(b->bhs[i]);
		b->bhs[i] = NULL;
	}
	this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr);
	per_cpu(bh_accounting, cpu).nr = 0;
	return 0;
}

/**
 * bh_uptodate_or_lock - Test whether the buffer is uptodate
 * @bh: struct buffer_head
 *
 * Return true if the buffer is up-to-date and false,
 * with the buffer locked, if not.
 */
int bh_uptodate_or_lock(struct buffer_head *bh)
{
	if (!buffer_uptodate(bh)) {
		lock_buffer(bh);
		if (!buffer_uptodate(bh))
			return 0;
		unlock_buffer(bh);
	}
	return 1;
}
EXPORT_SYMBOL(bh_uptodate_or_lock);

/**
 * bh_submit_read - Submit a locked buffer for reading
 * @bh: struct buffer_head
 *
 * Returns zero on success and -EIO on error.
 */
int bh_submit_read(struct buffer_head *bh)
{
	BUG_ON(!buffer_locked(bh));

	if (buffer_uptodate(bh)) {
		unlock_buffer(bh);
		return 0;
	}

	get_bh(bh);
	bh->b_end_io = end_buffer_read_sync;
	submit_bh(REQ_OP_READ, 0, bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return 0;
	return -EIO;
}
EXPORT_SYMBOL(bh_submit_read);

void __init buffer_init(void)
{
	unsigned long nrpages;
	int ret;

	bh_cachep = kmem_cache_create("buffer_head",
			sizeof(struct buffer_head), 0,
				(SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
				SLAB_MEM_SPREAD),
				NULL);

	/*
	 * Limit the bh occupancy to 10% of ZONE_NORMAL
	 */
	nrpages = (nr_free_buffer_pages() * 10) / 100;
	max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head));
	ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead",
					NULL, buffer_exit_cpu_dead);
	WARN_ON(ret < 0);
}
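/*
 * Illustrative sketch (not part of this file): bh_uptodate_or_lock() and
 * bh_submit_read() are designed to be used together so a metadata block is
 * only read from disk when it is not already uptodate.  sb_getblk() and
 * brelse() are the standard buffer-cache calls; "blocknr" is just an example
 * argument and the block is #if 0'd out.
 */
#if 0
static struct buffer_head *myfs_read_block(struct super_block *sb,
					   sector_t blocknr)
{
	struct buffer_head *bh = sb_getblk(sb, blocknr);

	if (!bh)
		return NULL;
	if (!bh_uptodate_or_lock(bh)) {
		/* buffer came back locked and not uptodate: read it now */
		if (bh_submit_read(bh)) {
			brelse(bh);
			return NULL;
		}
	}
	return bh;
}
#endif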