diff options
author | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 10:22:15 +0200 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2011-04-26 10:22:59 +0200 |
commit | 07f9479a40cc778bc1462ada11f95b01360ae4ff (patch) | |
tree | 0676cf38df3844004bb3ebfd99dfa67a4a8998f5 /fs/xfs/linux-2.6 | |
parent | 9d5e6bdb3013acfb311ab407eeca0b6a6a3dedbf (diff) | |
parent | cd2e49e90f1cae7726c9a2c54488d881d7f1cd1c (diff) |
Merge branch 'master' into for-next
Fast-forwarded to current state of Linus' tree as there are patches to be
applied for files that didn't exist on the old branch.
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r-- | fs/xfs/linux-2.6/kmem.c | 9 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 12 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 392 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.h | 40 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_file.c | 8 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_ioctl.c | 4 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_iops.c | 2 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_linux.h | 23 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_message.c | 126 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_message.h | 40 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_super.c | 293 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 265 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.h | 2 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sysctl.c | 2 |
14 files changed, 595 insertions, 623 deletions
diff --git a/fs/xfs/linux-2.6/kmem.c b/fs/xfs/linux-2.6/kmem.c index 666c9db48eb..a907de565db 100644 --- a/fs/xfs/linux-2.6/kmem.c +++ b/fs/xfs/linux-2.6/kmem.c @@ -23,6 +23,7 @@ #include <linux/backing-dev.h> #include "time.h" #include "kmem.h" +#include "xfs_message.h" /* * Greedy allocation. May fail and may return vmalloced memory. @@ -56,8 +57,8 @@ kmem_alloc(size_t size, unsigned int __nocast flags) if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) - printk(KERN_ERR "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); @@ -112,8 +113,8 @@ kmem_zone_alloc(kmem_zone_t *zone, unsigned int __nocast flags) if (ptr || (flags & (KM_MAYFAIL|KM_NOSLEEP))) return ptr; if (!(++retries % 100)) - printk(KERN_ERR "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", __func__, lflags); congestion_wait(BLK_RW_ASYNC, HZ/50); } while (1); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index ec7bbb5645b..79ce38be15a 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -413,8 +413,7 @@ xfs_submit_ioend_bio( if (xfs_ioend_new_eof(ioend)) xfs_mark_inode_dirty(XFS_I(ioend->io_inode)); - submit_bio(wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC_PLUG : WRITE, bio); + submit_bio(wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE, bio); } STATIC struct bio * @@ -854,7 +853,7 @@ xfs_aops_discard_page( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) goto out_invalidate; - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + xfs_alert(ip->i_mount, "page discard on page %p, inode 0x%llx, offset %llu.", page, ip->i_ino, offset); @@ -872,7 +871,7 @@ xfs_aops_discard_page( if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + xfs_alert(ip->i_mount, "page discard unable to remove delalloc mapping."); } break; @@ -1296,7 +1295,7 @@ xfs_get_blocks_direct( * If the private argument is non-NULL __xfs_get_blocks signals us that we * need to issue a transaction to convert the range from unwritten to written * extents. In case this is regular synchronous I/O we just call xfs_end_io - * to do this and we are done. But in case this was a successfull AIO + * to do this and we are done. But in case this was a successful AIO * request this handler is called from interrupt context, from which we * can't start transactions. In that case offload the I/O completion to * the workqueues we also use for buffered I/O completion. @@ -1411,7 +1410,7 @@ xfs_vm_write_failed( if (error) { /* something screwed, just bail */ if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_ALERT, ip->i_mount, + xfs_alert(ip->i_mount, "xfs_vm_write_failed: unable to clean up ino %lld", ip->i_ino); } @@ -1495,7 +1494,6 @@ const struct address_space_operations xfs_address_space_operations = { .readpages = xfs_vm_readpages, .writepage = xfs_vm_writepage, .writepages = xfs_vm_writepages, - .sync_page = block_sync_page, .releasepage = xfs_vm_releasepage, .invalidatepage = xfs_vm_invalidatepage, .write_begin = xfs_vm_write_begin, diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index f83a4c830a6..9ef9ed2cfe2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -94,75 +94,6 @@ xfs_buf_vmap_len( } /* - * Page Region interfaces. - * - * For pages in filesystems where the blocksize is smaller than the - * pagesize, we use the page->private field (long) to hold a bitmap - * of uptodate regions within the page. - * - * Each such region is "bytes per page / bits per long" bytes long. - * - * NBPPR == number-of-bytes-per-page-region - * BTOPR == bytes-to-page-region (rounded up) - * BTOPRT == bytes-to-page-region-truncated (rounded down) - */ -#if (BITS_PER_LONG == 32) -#define PRSHIFT (PAGE_CACHE_SHIFT - 5) /* (32 == 1<<5) */ -#elif (BITS_PER_LONG == 64) -#define PRSHIFT (PAGE_CACHE_SHIFT - 6) /* (64 == 1<<6) */ -#else -#error BITS_PER_LONG must be 32 or 64 -#endif -#define NBPPR (PAGE_CACHE_SIZE/BITS_PER_LONG) -#define BTOPR(b) (((unsigned int)(b) + (NBPPR - 1)) >> PRSHIFT) -#define BTOPRT(b) (((unsigned int)(b) >> PRSHIFT)) - -STATIC unsigned long -page_region_mask( - size_t offset, - size_t length) -{ - unsigned long mask; - int first, final; - - first = BTOPR(offset); - final = BTOPRT(offset + length - 1); - first = min(first, final); - - mask = ~0UL; - mask <<= BITS_PER_LONG - (final - first); - mask >>= BITS_PER_LONG - (final); - - ASSERT(offset + length <= PAGE_CACHE_SIZE); - ASSERT((final - first) < BITS_PER_LONG && (final - first) >= 0); - - return mask; -} - -STATIC void -set_page_region( - struct page *page, - size_t offset, - size_t length) -{ - set_page_private(page, - page_private(page) | page_region_mask(offset, length)); - if (page_private(page) == ~0UL) - SetPageUptodate(page); -} - -STATIC int -test_page_region( - struct page *page, - size_t offset, - size_t length) -{ - unsigned long mask = page_region_mask(offset, length); - - return (mask && (page_private(page) & mask) == mask); -} - -/* * xfs_buf_lru_add - add a buffer to the LRU. * * The LRU takes a new reference to the buffer so that it will only be freed @@ -189,7 +120,7 @@ xfs_buf_lru_add( * The unlocked check is safe here because it only occurs when there are not * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there * to optimise the shrinker removing the buffer from the LRU and calling - * xfs_buf_free(). i.e. it removes an unneccessary round trip on the + * xfs_buf_free(). i.e. it removes an unnecessary round trip on the * bt_lru_lock. */ STATIC void @@ -332,7 +263,7 @@ xfs_buf_free( ASSERT(list_empty(&bp->b_lru)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { + if (bp->b_flags & _XBF_PAGES) { uint i; if (xfs_buf_is_vmapped(bp)) @@ -342,56 +273,77 @@ xfs_buf_free( for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; - if (bp->b_flags & _XBF_PAGE_CACHE) - ASSERT(!PagePrivate(page)); - page_cache_release(page); + __free_page(page); } - } + } else if (bp->b_flags & _XBF_KMEM) + kmem_free(bp->b_addr); _xfs_buf_free_pages(bp); xfs_buf_deallocate(bp); } /* - * Finds all pages for buffer in question and builds it's page list. + * Allocates all the pages for buffer in question and builds it's page list. */ STATIC int -_xfs_buf_lookup_pages( +xfs_buf_allocate_memory( xfs_buf_t *bp, uint flags) { - struct address_space *mapping = bp->b_target->bt_mapping; - size_t blocksize = bp->b_target->bt_bsize; size_t size = bp->b_count_desired; size_t nbytes, offset; gfp_t gfp_mask = xb_to_gfp(flags); unsigned short page_count, i; - pgoff_t first; xfs_off_t end; int error; + /* + * for buffers that are contained within a single page, just allocate + * the memory from the heap - there's no need for the complexity of + * page arrays to keep allocation down to order 0. + */ + if (bp->b_buffer_length < PAGE_SIZE) { + bp->b_addr = kmem_alloc(bp->b_buffer_length, xb_to_km(flags)); + if (!bp->b_addr) { + /* low memory - use alloc_page loop instead */ + goto use_alloc_page; + } + + if (((unsigned long)(bp->b_addr + bp->b_buffer_length - 1) & + PAGE_MASK) != + ((unsigned long)bp->b_addr & PAGE_MASK)) { + /* b_addr spans two pages - use alloc_page instead */ + kmem_free(bp->b_addr); + bp->b_addr = NULL; + goto use_alloc_page; + } + bp->b_offset = offset_in_page(bp->b_addr); + bp->b_pages = bp->b_page_array; + bp->b_pages[0] = virt_to_page(bp->b_addr); + bp->b_page_count = 1; + bp->b_flags |= XBF_MAPPED | _XBF_KMEM; + return 0; + } + +use_alloc_page: end = bp->b_file_offset + bp->b_buffer_length; page_count = xfs_buf_btoc(end) - xfs_buf_btoct(bp->b_file_offset); - error = _xfs_buf_get_pages(bp, page_count, flags); if (unlikely(error)) return error; - bp->b_flags |= _XBF_PAGE_CACHE; offset = bp->b_offset; - first = bp->b_file_offset >> PAGE_CACHE_SHIFT; + bp->b_flags |= _XBF_PAGES; for (i = 0; i < bp->b_page_count; i++) { struct page *page; uint retries = 0; - - retry: - page = find_or_create_page(mapping, first + i, gfp_mask); +retry: + page = alloc_page(gfp_mask); if (unlikely(page == NULL)) { if (flags & XBF_READ_AHEAD) { bp->b_page_count = i; - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - return -ENOMEM; + error = ENOMEM; + goto out_free_pages; } /* @@ -401,9 +353,8 @@ _xfs_buf_lookup_pages( * handle buffer allocation failures we can't do much. */ if (!(++retries % 100)) - printk(KERN_ERR - "XFS: possible memory allocation " - "deadlock in %s (mode:0x%x)\n", + xfs_err(NULL, + "possible memory allocation deadlock in %s (mode:0x%x)", __func__, gfp_mask); XFS_STATS_INC(xb_page_retries); @@ -413,52 +364,44 @@ _xfs_buf_lookup_pages( XFS_STATS_INC(xb_page_found); - nbytes = min_t(size_t, size, PAGE_CACHE_SIZE - offset); + nbytes = min_t(size_t, size, PAGE_SIZE - offset); size -= nbytes; - - ASSERT(!PagePrivate(page)); - if (!PageUptodate(page)) { - page_count--; - if (blocksize >= PAGE_CACHE_SIZE) { - if (flags & XBF_READ) - bp->b_flags |= _XBF_PAGE_LOCKED; - } else if (!PagePrivate(page)) { - if (test_page_region(page, offset, nbytes)) - page_count++; - } - } - bp->b_pages[i] = page; offset = 0; } + return 0; - if (!(bp->b_flags & _XBF_PAGE_LOCKED)) { - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - } - - if (page_count == bp->b_page_count) - bp->b_flags |= XBF_DONE; - +out_free_pages: + for (i = 0; i < bp->b_page_count; i++) + __free_page(bp->b_pages[i]); return error; } /* - * Map buffer into kernel address-space if nessecary. + * Map buffer into kernel address-space if necessary. */ STATIC int _xfs_buf_map_pages( xfs_buf_t *bp, uint flags) { - /* A single page buffer is always mappable */ + ASSERT(bp->b_flags & _XBF_PAGES); if (bp->b_page_count == 1) { + /* A single page buffer is always mappable */ bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); - if (unlikely(bp->b_addr == NULL)) + int retried = 0; + + do { + bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, + -1, PAGE_KERNEL); + if (bp->b_addr) + break; + vm_unmap_aliases(); + } while (retried++ <= 1); + + if (!bp->b_addr) return -ENOMEM; bp->b_addr += bp->b_offset; bp->b_flags |= XBF_MAPPED; @@ -569,9 +512,14 @@ found: } } + /* + * if the buffer is stale, clear all the external state associated with + * it. We need to keep flags such as how we allocated the buffer memory + * intact here. + */ if (bp->b_flags & XBF_STALE) { ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= XBF_MAPPED; + bp->b_flags &= XBF_MAPPED | _XBF_KMEM | _XBF_PAGES; } trace_xfs_buf_find(bp, flags, _RET_IP_); @@ -592,7 +540,7 @@ xfs_buf_get( xfs_buf_flags_t flags) { xfs_buf_t *bp, *new_bp; - int error = 0, i; + int error = 0; new_bp = xfs_buf_allocate(flags); if (unlikely(!new_bp)) @@ -600,7 +548,7 @@ xfs_buf_get( bp = _xfs_buf_find(target, ioff, isize, flags, new_bp); if (bp == new_bp) { - error = _xfs_buf_lookup_pages(bp, flags); + error = xfs_buf_allocate_memory(bp, flags); if (error) goto no_buffer; } else { @@ -609,14 +557,11 @@ xfs_buf_get( return NULL; } - for (i = 0; i < bp->b_page_count; i++) - mark_page_accessed(bp->b_pages[i]); - if (!(bp->b_flags & XBF_MAPPED)) { error = _xfs_buf_map_pages(bp, flags); if (unlikely(error)) { - printk(KERN_WARNING "%s: failed to map pages\n", - __func__); + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); goto no_buffer; } } @@ -710,10 +655,7 @@ xfs_buf_readahead( xfs_off_t ioff, size_t isize) { - struct backing_dev_info *bdi; - - bdi = target->bt_mapping->backing_dev_info; - if (bdi_read_congested(bdi)) + if (bdi_read_congested(target->bt_bdi)) return; xfs_buf_read(target, ioff, isize, @@ -791,10 +733,10 @@ xfs_buf_associate_memory( size_t buflen; int page_count; - pageaddr = (unsigned long)mem & PAGE_CACHE_MASK; + pageaddr = (unsigned long)mem & PAGE_MASK; offset = (unsigned long)mem - pageaddr; - buflen = PAGE_CACHE_ALIGN(len + offset); - page_count = buflen >> PAGE_CACHE_SHIFT; + buflen = PAGE_ALIGN(len + offset); + page_count = buflen >> PAGE_SHIFT; /* Free any previous set of page pointers */ if (bp->b_pages) @@ -811,13 +753,12 @@ xfs_buf_associate_memory( for (i = 0; i < bp->b_page_count; i++) { bp->b_pages[i] = mem_to_page((void *)pageaddr); - pageaddr += PAGE_CACHE_SIZE; + pageaddr += PAGE_SIZE; } bp->b_count_desired = len; bp->b_buffer_length = buflen; bp->b_flags |= XBF_MAPPED; - bp->b_flags &= ~_XBF_PAGE_LOCKED; return 0; } @@ -850,8 +791,8 @@ xfs_buf_get_uncached( error = _xfs_buf_map_pages(bp, XBF_MAPPED); if (unlikely(error)) { - printk(KERN_WARNING "%s: failed to map pages\n", - __func__); + xfs_warn(target->bt_mount, + "%s: failed to map pages\n", __func__); goto fail_free_mem; } @@ -924,20 +865,7 @@ xfs_buf_rele( /* - * Mutual exclusion on buffers. Locking model: - * - * Buffers associated with inodes for which buffer locking - * is not enabled are not protected by semaphores, and are - * assumed to be exclusively owned by the caller. There is a - * spinlock in the buffer, used by the caller when concurrent - * access is possible. - */ - -/* - * Locks a buffer object, if it is not already locked. Note that this in - * no way locks the underlying pages, so it is only useful for - * synchronizing concurrent use of buffer objects, not for synchronizing - * independent access to the underlying pages. + * Lock a buffer object, if it is not already locked. * * If we come across a stale, pinned, locked buffer, we know that we are * being asked to lock a buffer that has been reallocated. Because it is @@ -971,10 +899,7 @@ xfs_buf_lock_value( } /* - * Locks a buffer object. - * Note that this in no way locks the underlying pages, so it is only - * useful for synchronizing concurrent use of buffer objects, not for - * synchronizing independent access to the underlying pages. + * Lock a buffer object. * * If we come across a stale, pinned, locked buffer, we know that we * are being asked to lock a buffer that has been reallocated. Because @@ -990,8 +915,6 @@ xfs_buf_lock( if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) xfs_log_force(bp->b_target->bt_mount, 0); - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); XB_SET_OWNER(bp); @@ -1035,9 +958,7 @@ xfs_buf_wait_unpin( set_current_state(TASK_UNINTERRUPTIBLE); if (atomic_read(&bp->b_pin_count) == 0) break; - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); - schedule(); + io_schedule(); } remove_wait_queue(&bp->b_waiters, &wait); set_current_state(TASK_RUNNING); @@ -1249,10 +1170,8 @@ _xfs_buf_ioend( xfs_buf_t *bp, int schedule) { - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { - bp->b_flags &= ~_XBF_PAGE_LOCKED; + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) xfs_buf_ioend(bp, schedule); - } } STATIC void @@ -1261,35 +1180,12 @@ xfs_buf_bio_end_io( int error) { xfs_buf_t *bp = (xfs_buf_t *)bio->bi_private; - unsigned int blocksize = bp->b_target->bt_bsize; - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; xfs_buf_ioerror(bp, -error); if (!error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - do { - struct page *page = bvec->bv_page; - - ASSERT(!PagePrivate(page)); - if (unlikely(bp->b_error)) { - if (bp->b_flags & XBF_READ) - ClearPageUptodate(page); - } else if (blocksize >= PAGE_CACHE_SIZE) { - SetPageUptodate(page); - } else if (!PagePrivate(page) && - (bp->b_flags & _XBF_PAGE_CACHE)) { - set_page_region(page, bvec->bv_offset, bvec->bv_len); - } - - if (--bvec >= bio->bi_io_vec) - prefetchw(&bvec->bv_page->flags); - - if (bp->b_flags & _XBF_PAGE_LOCKED) - unlock_page(page); - } while (bvec >= bio->bi_io_vec); - _xfs_buf_ioend(bp, 1); bio_put(bio); } @@ -1303,7 +1199,6 @@ _xfs_buf_ioapply( int offset = bp->b_offset; int size = bp->b_count_desired; sector_t sector = bp->b_bn; - unsigned int blocksize = bp->b_target->bt_bsize; total_nr_pages = bp->b_page_count; map_i = 0; @@ -1324,29 +1219,6 @@ _xfs_buf_ioapply( (bp->b_flags & XBF_READ_AHEAD) ? READA : READ; } - /* Special code path for reading a sub page size buffer in -- - * we populate up the whole page, and hence the other metadata - * in the same page. This optimization is only valid when the - * filesystem block size is not smaller than the page size. - */ - if ((bp->b_buffer_length < PAGE_CACHE_SIZE) && - ((bp->b_flags & (XBF_READ|_XBF_PAGE_LOCKED)) == - (XBF_READ|_XBF_PAGE_LOCKED)) && - (blocksize >= PAGE_CACHE_SIZE)) { - bio = bio_alloc(GFP_NOIO, 1); - - bio->bi_bdev = bp->b_target->bt_bdev; - bio->bi_sector = sector - (offset >> BBSHIFT); - bio->bi_end_io = xfs_buf_bio_end_io; - bio->bi_private = bp; - - bio_add_page(bio, bp->b_pages[0], PAGE_CACHE_SIZE, 0); - size = 0; - - atomic_inc(&bp->b_io_remaining); - - goto submit_io; - } next_chunk: atomic_inc(&bp->b_io_remaining); @@ -1360,8 +1232,9 @@ next_chunk: bio->bi_end_io = xfs_buf_bio_end_io; bio->bi_private = bp; + for (; size && nr_pages; nr_pages--, map_i++) { - int rbytes, nbytes = PAGE_CACHE_SIZE - offset; + int rbytes, nbytes = PAGE_SIZE - offset; if (nbytes > size) nbytes = size; @@ -1376,7 +1249,6 @@ next_chunk: total_nr_pages--; } -submit_io: if (likely(bio->bi_size)) { if (xfs_buf_is_vmapped(bp)) { flush_kernel_vmap_range(bp->b_addr, @@ -1386,18 +1258,7 @@ submit_io: if (size) goto next_chunk; } else { - /* - * if we get here, no pages were added to the bio. However, - * we can't just error out here - if the pages are locked then - * we have to unlock them otherwise we can hang on a later - * access to the page. - */ xfs_buf_ioerror(bp, EIO); - if (bp->b_flags & _XBF_PAGE_LOCKED) { - int i; - for (i = 0; i < bp->b_page_count; i++) - unlock_page(bp->b_pages[i]); - } bio_put(bio); } } @@ -1442,8 +1303,6 @@ xfs_buf_iowait( { trace_xfs_buf_iowait(bp, _RET_IP_); - if (atomic_read(&bp->b_io_remaining)) - blk_run_address_space(bp->b_target->bt_mapping); wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); @@ -1461,8 +1320,8 @@ xfs_buf_offset( return XFS_BUF_PTR(bp) + offset; offset += bp->b_offset; - page = bp->b_pages[offset >> PAGE_CACHE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_CACHE_SIZE-1)); + page = bp->b_pages[offset >> PAGE_SHIFT]; + return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); } /* @@ -1484,9 +1343,9 @@ xfs_buf_iomove( page = bp->b_pages[xfs_buf_btoct(boff + bp->b_offset)]; cpoff = xfs_buf_poff(boff + bp->b_offset); csize = min_t(size_t, - PAGE_CACHE_SIZE-cpoff, bp->b_count_desired-boff); + PAGE_SIZE-cpoff, bp->b_count_desired-boff); - ASSERT(((csize + cpoff) <= PAGE_CACHE_SIZE)); + ASSERT(((csize + cpoff) <= PAGE_SIZE)); switch (mode) { case XBRW_ZERO: @@ -1599,7 +1458,6 @@ xfs_free_buftarg( xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - iput(btp->bt_mapping->host); kthread_stop(btp->bt_task); kmem_free(btp); @@ -1617,21 +1475,12 @@ xfs_setsize_buftarg_flags( btp->bt_smask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { - printk(KERN_WARNING - "XFS: Cannot set_blocksize to %u on device %s\n", + xfs_warn(btp->bt_mount, + "Cannot set_blocksize to %u on device %s\n", sectorsize, XFS_BUFTARG_NAME(btp)); return EINVAL; } - if (verbose && - (PAGE_CACHE_SIZE / BITS_PER_LONG) > sectorsize) { - printk(KERN_WARNING - "XFS: %u byte sectors in use on device %s. " - "This is suboptimal; %u or greater is ideal.\n", - sectorsize, XFS_BUFTARG_NAME(btp), - (unsigned int)PAGE_CACHE_SIZE / BITS_PER_LONG); - } - return 0; } @@ -1646,7 +1495,7 @@ xfs_setsize_buftarg_early( struct block_device *bdev) { return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); + PAGE_SIZE, bdev_logical_block_size(bdev), 0); } int @@ -1659,41 +1508,6 @@ xfs_setsize_buftarg( } STATIC int -xfs_mapping_buftarg( - xfs_buftarg_t *btp, - struct block_device *bdev) -{ - struct backing_dev_info *bdi; - struct inode *inode; - struct address_space *mapping; - static const struct address_space_operations mapping_aops = { - .sync_page = block_sync_page, - .migratepage = fail_migrate_page, - }; - - inode = new_inode(bdev->bd_inode->i_sb); - if (!inode) { - printk(KERN_WARNING - "XFS: Cannot allocate mapping inode for device %s\n", - XFS_BUFTARG_NAME(btp)); - return ENOMEM; - } - inode->i_ino = get_next_ino(); - inode->i_mode = S_IFBLK; - inode->i_bdev = bdev; - inode->i_rdev = bdev->bd_dev; - bdi = blk_get_backing_dev_info(bdev); - if (!bdi) - bdi = &default_backing_dev_info; - mapping = &inode->i_data; - mapping->a_ops = &mapping_aops; - mapping->backing_dev_info = bdi; - mapping_set_gfp_mask(mapping, GFP_NOFS); - btp->bt_mapping = mapping; - return 0; -} - -STATIC int xfs_alloc_delwrite_queue( xfs_buftarg_t *btp, const char *fsname) @@ -1721,12 +1535,14 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; + btp->bt_bdi = blk_get_backing_dev_info(bdev); + if (!btp->bt_bdi) + goto error; + INIT_LIST_HEAD(&btp->bt_lru); spin_lock_init(&btp->bt_lru_lock); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; - if (xfs_mapping_buftarg(btp, bdev)) - goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; btp->bt_shrinker.shrink = xfs_buftarg_shrink; @@ -1923,8 +1739,8 @@ xfsbufd( do { long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); - int count = 0; struct list_head tmp; + struct blk_plug plug; if (unlikely(freezing(current))) { set_bit(XBT_FORCE_SLEEP, &target->bt_flags); @@ -1940,16 +1756,15 @@ xfsbufd( xfs_buf_delwri_split(target, &tmp, age); list_sort(NULL, &tmp, xfs_buf_cmp); + + blk_start_plug(&plug); while (!list_empty(&tmp)) { struct xfs_buf *bp; bp = list_first_entry(&tmp, struct xfs_buf, b_list); list_del_init(&bp->b_list); xfs_bdstrat_cb(bp); - count++; } - if (count) - blk_run_address_space(target->bt_mapping); - + blk_finish_plug(&plug); } while (!kthread_should_stop()); return 0; @@ -1969,6 +1784,7 @@ xfs_flush_buftarg( int pincount = 0; LIST_HEAD(tmp_list); LIST_HEAD(wait_list); + struct blk_plug plug; xfs_buf_runall_queues(xfsconvertd_workqueue); xfs_buf_runall_queues(xfsdatad_workqueue); @@ -1983,6 +1799,8 @@ xfs_flush_buftarg( * we do that after issuing all the IO. */ list_sort(NULL, &tmp_list, xfs_buf_cmp); + + blk_start_plug(&plug); while (!list_empty(&tmp_list)) { bp = list_first_entry(&tmp_list, struct xfs_buf, b_list); ASSERT(target == bp->b_target); @@ -1993,10 +1811,10 @@ xfs_flush_buftarg( } xfs_bdstrat_cb(bp); } + blk_finish_plug(&plug); if (wait) { - /* Expedite and wait for IO to complete. */ - blk_run_address_space(target->bt_mapping); + /* Wait for IO to complete. */ while (!list_empty(&wait_list)) { bp = list_first_entry(&wait_list, struct xfs_buf, b_list); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index cbe65950e52..a9a1c451264 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -61,30 +61,11 @@ typedef enum { #define XBF_DONT_BLOCK (1 << 16)/* do not block in current thread */ /* flags used only internally */ -#define _XBF_PAGE_CACHE (1 << 17)/* backed by pagecache */ #define _XBF_PAGES (1 << 18)/* backed by refcounted pages */ #define _XBF_RUN_QUEUES (1 << 19)/* run block device task queue */ +#define _XBF_KMEM (1 << 20)/* backed by heap memory */ #define _XBF_DELWRI_Q (1 << 21)/* buffer on delwri queue */ -/* - * Special flag for supporting metadata blocks smaller than a FSB. - * - * In this case we can have multiple xfs_buf_t on a single page and - * need to lock out concurrent xfs_buf_t readers as they only - * serialise access to the buffer. - * - * If the FSB size >= PAGE_CACHE_SIZE case, we have no serialisation - * between reads of the page. Hence we can have one thread read the - * page and modify it, but then race with another thread that thinks - * the page is not up-to-date and hence reads it again. - * - * The result is that the first modifcation to the page is lost. - * This sort of AGF/AGI reading race can happen when unlinking inodes - * that require truncation and results in the AGI unlinked list - * modifications being lost. - */ -#define _XBF_PAGE_LOCKED (1 << 22) - typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -100,12 +81,10 @@ typedef unsigned int xfs_buf_flags_t; { XBF_LOCK, "LOCK" }, /* should never be set */\ { XBF_TRYLOCK, "TRYLOCK" }, /* ditto */\ { XBF_DONT_BLOCK, "DONT_BLOCK" }, /* ditto */\ - { _XBF_PAGE_CACHE, "PAGE_CACHE" }, \ { _XBF_PAGES, "PAGES" }, \ { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ - { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } - + { _XBF_KMEM, "KMEM" }, \ + { _XBF_DELWRI_Q, "DELWRI_Q" } typedef enum { XBT_FORCE_SLEEP = 0, @@ -120,7 +99,7 @@ typedef struct xfs_bufhash { typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; - struct address_space *bt_mapping; + struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; unsigned int bt_bsize; unsigned int bt_sshift; @@ -139,17 +118,6 @@ typedef struct xfs_buftarg { unsigned int bt_lru_nr; } xfs_buftarg_t; -/* - * xfs_buf_t: Buffer structure for pagecache-based buffers - * - * This buffer structure is used by the pagecache buffer management routines - * to refer to an assembly of pages forming a logical buffer. - * - * The buffer structure is used on a temporary basis only, and discarded when - * released. The real data storage is recorded in the pagecache. Buffers are - * hashed to the block device on which the file system resides. - */ - struct xfs_buf; typedef void (*xfs_buf_iodone_t)(struct xfs_buf *); diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index a55c1b46b21..f4213ba1ff8 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -381,7 +381,7 @@ xfs_aio_write_isize_update( /* * If this was a direct or synchronous I/O that failed (such as ENOSPC) then - * part of the I/O may have been written to disk before the error occured. In + * part of the I/O may have been written to disk before the error occurred. In * this case the on-disk file size may have been adjusted beyond the in-memory * file size and now needs to be truncated back. */ @@ -896,6 +896,7 @@ xfs_file_fallocate( xfs_flock64_t bf; xfs_inode_t *ip = XFS_I(inode); int cmd = XFS_IOC_RESVSP; + int attr_flags = XFS_ATTR_NOLOCK; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; @@ -918,7 +919,10 @@ xfs_file_fallocate( goto out_unlock; } - error = -xfs_change_file_space(ip, cmd, &bf, 0, XFS_ATTR_NOLOCK); + if (file->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + + error = -xfs_change_file_space(ip, cmd, &bf, 0, attr_flags); if (error) goto out_unlock; diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 0ca0e3c024d..acca2c5ca3f 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -624,6 +624,10 @@ xfs_ioc_space( if (filp->f_flags & (O_NDELAY|O_NONBLOCK)) attr_flags |= XFS_ATTR_NONBLOCK; + + if (filp->f_flags & O_DSYNC) + attr_flags |= XFS_ATTR_SYNC; + if (ioflags & IO_INVIS) attr_flags |= XFS_ATTR_DMI; diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 9ff7fc603d2..dd21784525a 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -70,7 +70,7 @@ xfs_synchronize_times( /* * If the linux inode is valid, mark it dirty. - * Used when commiting a dirty inode into a transaction so that + * Used when committing a dirty inode into a transaction so that * the inode will get written back by the linux code */ void diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 09649499774..244be9cbfe7 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -39,7 +39,6 @@ #include <mrlock.h> #include <time.h> -#include <support/debug.h> #include <support/uuid.h> #include <linux/semaphore.h> @@ -86,6 +85,7 @@ #include <xfs_aops.h> #include <xfs_super.h> #include <xfs_buf.h> +#include <xfs_message.h> /* * Feature macros (disable/enable) @@ -280,4 +280,25 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #define __arch_pack #endif +#define ASSERT_ALWAYS(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef DEBUG +#define ASSERT(expr) ((void)0) + +#ifndef STATIC +# define STATIC static noinline +#endif + +#else /* DEBUG */ + +#define ASSERT(expr) \ + (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + +#ifndef STATIC +# define STATIC noinline +#endif + +#endif /* DEBUG */ + #endif /* __XFS_LINUX__ */ diff --git a/fs/xfs/linux-2.6/xfs_message.c b/fs/xfs/linux-2.6/xfs_message.c new file mode 100644 index 00000000000..9f76cceb678 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.c @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2011 Red Hat, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_types.h" +#include "xfs_log.h" +#include "xfs_inum.h" +#include "xfs_trans.h" +#include "xfs_sb.h" +#include "xfs_ag.h" +#include "xfs_mount.h" + +/* + * XFS logging functions + */ +static void +__xfs_printk( + const char *level, + const struct xfs_mount *mp, + struct va_format *vaf) +{ + if (mp && mp->m_fsname) { + printk("%sXFS (%s): %pV\n", level, mp->m_fsname, vaf); + return; + } + printk("%sXFS: %pV\n", level, vaf); +} + +void xfs_printk( + const char *level, + const struct xfs_mount *mp, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(level, mp, &vaf); + va_end(args); +} + +#define define_xfs_printk_level(func, kern_level) \ +void func(const struct xfs_mount *mp, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + __xfs_printk(kern_level, mp, &vaf); \ + va_end(args); \ +} \ + +define_xfs_printk_level(xfs_emerg, KERN_EMERG); +define_xfs_printk_level(xfs_alert, KERN_ALERT); +define_xfs_printk_level(xfs_crit, KERN_CRIT); +define_xfs_printk_level(xfs_err, KERN_ERR); +define_xfs_printk_level(xfs_warn, KERN_WARNING); +define_xfs_printk_level(xfs_notice, KERN_NOTICE); +define_xfs_printk_level(xfs_info, KERN_INFO); +#ifdef DEBUG +define_xfs_printk_level(xfs_debug, KERN_DEBUG); +#endif + +void +xfs_alert_tag( + const struct xfs_mount *mp, + int panic_tag, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int do_panic = 0; + + if (xfs_panic_mask && (xfs_panic_mask & panic_tag)) { + xfs_printk(KERN_ALERT, mp, + "XFS: Transforming an alert into a BUG."); + do_panic = 1; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + __xfs_printk(KERN_ALERT, mp, &vaf); + va_end(args); + + BUG_ON(do_panic); +} + +void +assfail(char *expr, char *file, int line) +{ + xfs_emerg(NULL, "Assertion failed: %s, file: %s, line: %d", + expr, file, line); + BUG(); +} + +void +xfs_hex_dump(void *p, int length) +{ + print_hex_dump(KERN_ALERT, "", DUMP_PREFIX_ADDRESS, 16, 1, p, length, 1); +} diff --git a/fs/xfs/linux-2.6/xfs_message.h b/fs/xfs/linux-2.6/xfs_message.h new file mode 100644 index 00000000000..f1b3fc1b6c4 --- /dev/null +++ b/fs/xfs/linux-2.6/xfs_message.h @@ -0,0 +1,40 @@ +#ifndef __XFS_MESSAGE_H +#define __XFS_MESSAGE_H 1 + +struct xfs_mount; + +extern void xfs_printk(const char *level, const struct xfs_mount *mp, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); +extern void xfs_emerg(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_alert_tag(const struct xfs_mount *mp, int tag, + const char *fmt, ...) + __attribute__ ((format (printf, 3, 4))); +extern void xfs_crit(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_err(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_warn(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_notice(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern void xfs_info(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); + +#ifdef DEBUG +extern void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +#else +static inline void xfs_debug(const struct xfs_mount *mp, const char *fmt, ...) +{ +} +#endif + +extern void assfail(char *expr, char *f, int l); + +extern void xfs_hex_dump(void *p, int length); + +#endif /* __XFS_MESSAGE_H */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 9731898083a..b38e58d0229 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -173,6 +173,15 @@ xfs_parseargs( __uint8_t iosizelog = 0; /* + * set up the mount name first so all the errors will refer to the + * correct device. + */ + mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); + if (!mp->m_fsname) + return ENOMEM; + mp->m_fsname_len = strlen(mp->m_fsname) + 1; + + /* * Copy binary VFS mount flags we are interested in. */ if (sb->s_flags & MS_RDONLY) @@ -189,6 +198,7 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_BARRIER; mp->m_flags |= XFS_MOUNT_COMPAT_IOSIZE; mp->m_flags |= XFS_MOUNT_SMALL_INUMS; + mp->m_flags |= XFS_MOUNT_DELAYLOG; /* * These can be overridden by the mount option parsing. @@ -207,24 +217,21 @@ xfs_parseargs( if (!strcmp(this_char, MNTOPT_LOGBUFS)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } mp->m_logbufs = simple_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_LOGBSIZE)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } mp->m_logbsize = suffix_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_LOGDEV)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } @@ -232,14 +239,12 @@ xfs_parseargs( if (!mp->m_logname) return ENOMEM; } else if (!strcmp(this_char, MNTOPT_MTPT)) { - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", + xfs_warn(mp, "%s option not allowed on this system", this_char); return EINVAL; } else if (!strcmp(this_char, MNTOPT_RTDEV)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } @@ -248,8 +253,7 @@ xfs_parseargs( return ENOMEM; } else if (!strcmp(this_char, MNTOPT_BIOSIZE)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } @@ -257,8 +261,7 @@ xfs_parseargs( iosizelog = ffs(iosize) - 1; } else if (!strcmp(this_char, MNTOPT_ALLOCSIZE)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } @@ -280,16 +283,14 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_SWALLOC; } else if (!strcmp(this_char, MNTOPT_SUNIT)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } dsunit = simple_strtoul(value, &eov, 10); } else if (!strcmp(this_char, MNTOPT_SWIDTH)) { if (!value || !*value) { - cmn_err(CE_WARN, - "XFS: %s option requires an argument", + xfs_warn(mp, "%s option requires an argument", this_char); return EINVAL; } @@ -297,8 +298,7 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_64BITINODE)) { mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS; #if !XFS_BIG_INUMS - cmn_err(CE_WARN, - "XFS: %s option not allowed on this system", + xfs_warn(mp, "%s option not allowed on this system", this_char); return EINVAL; #endif @@ -356,20 +356,19 @@ xfs_parseargs( } else if (!strcmp(this_char, MNTOPT_NODELAYLOG)) { mp->m_flags &= ~XFS_MOUNT_DELAYLOG; } else if (!strcmp(this_char, "ihashsize")) { - cmn_err(CE_WARN, - "XFS: ihashsize no longer used, option is deprecated."); + xfs_warn(mp, + "ihashsize no longer used, option is deprecated."); } else if (!strcmp(this_char, "osyncisdsync")) { - cmn_err(CE_WARN, - "XFS: osyncisdsync has no effect, option is deprecated."); + xfs_warn(mp, + "osyncisdsync has no effect, option is deprecated."); } else if (!strcmp(this_char, "osyncisosync")) { - cmn_err(CE_WARN, - "XFS: osyncisosync has no effect, option is deprecated."); + xfs_warn(mp, + "osyncisosync has no effect, option is deprecated."); } else if (!strcmp(this_char, "irixsgid")) { - cmn_err(CE_WARN, - "XFS: irixsgid is now a sysctl(2) variable, option is deprecated."); + xfs_warn(mp, + "irixsgid is now a sysctl(2) variable, option is deprecated."); } else { - cmn_err(CE_WARN, - "XFS: unknown mount option [%s].", this_char); + xfs_warn(mp, "unknown mount option [%s].", this_char); return EINVAL; } } @@ -379,40 +378,37 @@ xfs_parseargs( */ if ((mp->m_flags & XFS_MOUNT_NORECOVERY) && !(mp->m_flags & XFS_MOUNT_RDONLY)) { - cmn_err(CE_WARN, "XFS: no-recovery mounts must be read-only."); + xfs_warn(mp, "no-recovery mounts must be read-only."); return EINVAL; } if ((mp->m_flags & XFS_MOUNT_NOALIGN) && (dsunit || dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth options incompatible with the noalign option"); + xfs_warn(mp, + "sunit and swidth options incompatible with the noalign option"); return EINVAL; } #ifndef CONFIG_XFS_QUOTA if (XFS_IS_QUOTA_RUNNING(mp)) { - cmn_err(CE_WARN, - "XFS: quota support not available in this kernel."); + xfs_warn(mp, "quota support not available in this kernel."); return EINVAL; } #endif if ((mp->m_qflags & (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE)) && (mp->m_qflags & (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE))) { - cmn_err(CE_WARN, - "XFS: cannot mount with both project and group quota"); + xfs_warn(mp, "cannot mount with both project and group quota"); return EINVAL; } if ((dsunit && !dswidth) || (!dsunit && dswidth)) { - cmn_err(CE_WARN, - "XFS: sunit and swidth must be specified together"); + xfs_warn(mp, "sunit and swidth must be specified together"); return EINVAL; } if (dsunit && (dswidth % dsunit != 0)) { - cmn_err(CE_WARN, - "XFS: stripe width (%d) must be a multiple of the stripe unit (%d)", + xfs_warn(mp, + "stripe width (%d) must be a multiple of the stripe unit (%d)", dswidth, dsunit); return EINVAL; } @@ -438,8 +434,7 @@ done: mp->m_logbufs != 0 && (mp->m_logbufs < XLOG_MIN_ICLOGS || mp->m_logbufs > XLOG_MAX_ICLOGS)) { - cmn_err(CE_WARN, - "XFS: invalid logbufs value: %d [not %d-%d]", + xfs_warn(mp, "invalid logbufs value: %d [not %d-%d]", mp->m_logbufs, XLOG_MIN_ICLOGS, XLOG_MAX_ICLOGS); return XFS_ERROR(EINVAL); } @@ -448,22 +443,16 @@ done: (mp->m_logbsize < XLOG_MIN_RECORD_BSIZE || mp->m_logbsize > XLOG_MAX_RECORD_BSIZE || !is_power_of_2(mp->m_logbsize))) { - cmn_err(CE_WARN, - "XFS: invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", + xfs_warn(mp, + "invalid logbufsize: %d [not 16k,32k,64k,128k or 256k]", mp->m_logbsize); return XFS_ERROR(EINVAL); } - mp->m_fsname = kstrndup(sb->s_id, MAXNAMELEN, GFP_KERNEL); - if (!mp->m_fsname) - return ENOMEM; - mp->m_fsname_len = strlen(mp->m_fsname) + 1; - if (iosizelog) { if (iosizelog > XFS_MAX_IO_LOG || iosizelog < XFS_MIN_IO_LOG) { - cmn_err(CE_WARN, - "XFS: invalid log iosize: %d [not %d-%d]", + xfs_warn(mp, "invalid log iosize: %d [not %d-%d]", iosizelog, XFS_MIN_IO_LOG, XFS_MAX_IO_LOG); return XFS_ERROR(EINVAL); @@ -610,7 +599,7 @@ xfs_blkdev_get( mp); if (IS_ERR(*bdevp)) { error = PTR_ERR(*bdevp); - printk("XFS: Invalid device [%s], error=%d\n", name, error); + xfs_warn(mp, "Invalid device [%s], error=%d\n", name, error); } return -error; @@ -664,23 +653,23 @@ xfs_mountfs_check_barriers(xfs_mount_t *mp) int error; if (mp->m_logdev_targp != mp->m_ddev_targp) { - xfs_fs_cmn_err(CE_NOTE, mp, + xfs_notice(mp, "Disabling barriers, not supported with external log device"); mp->m_flags &= ~XFS_MOUNT_BARRIER; return; } if (xfs_readonly_buftarg(mp->m_ddev_targp)) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, underlying device is readonly"); + xfs_notice(mp, + "Disabling barriers, underlying device is readonly"); mp->m_flags &= ~XFS_MOUNT_BARRIER; return; } error = xfs_barrier_test(mp); if (error) { - xfs_fs_cmn_err(CE_NOTE, mp, - "Disabling barriers, trial barrier write failed"); + xfs_notice(mp, + "Disabling barriers, trial barrier write failed"); mp->m_flags &= ~XFS_MOUNT_BARRIER; return; } @@ -743,8 +732,8 @@ xfs_open_devices( goto out_close_logdev; if (rtdev == ddev || rtdev == logdev) { - cmn_err(CE_WARN, - "XFS: Cannot mount filesystem with identical rtdev and ddev/logdev."); + xfs_warn(mp, + "Cannot mount filesystem with identical rtdev and ddev/logdev."); error = EINVAL; goto out_close_rtdev; } @@ -827,75 +816,6 @@ xfs_setup_devices( return 0; } -/* - * XFS AIL push thread support - */ -void -xfsaild_wakeup( - struct xfs_ail *ailp, - xfs_lsn_t threshold_lsn) -{ - /* only ever move the target forwards */ - if (XFS_LSN_CMP(threshold_lsn, ailp->xa_target) > 0) { - ailp->xa_target = threshold_lsn; - wake_up_process(ailp->xa_task); - } -} - -STATIC int -xfsaild( - void *data) -{ - struct xfs_ail *ailp = data; - xfs_lsn_t last_pushed_lsn = 0; - long tout = 0; /* milliseconds */ - - while (!kthread_should_stop()) { - /* - * for short sleeps indicating congestion, don't allow us to - * get woken early. Otherwise all we do is bang on the AIL lock - * without making progress. - */ - if (tout && tout <= 20) - __set_current_state(TASK_KILLABLE); - else - __set_current_state(TASK_INTERRUPTIBLE); - schedule_timeout(tout ? - msecs_to_jiffies(tout) : MAX_SCHEDULE_TIMEOUT); - - /* swsusp */ - try_to_freeze(); - - ASSERT(ailp->xa_mount->m_log); - if (XFS_FORCED_SHUTDOWN(ailp->xa_mount)) - continue; - - tout = xfsaild_push(ailp, &last_pushed_lsn); - } - - return 0; -} /* xfsaild */ - -int -xfsaild_start( - struct xfs_ail *ailp) -{ - ailp->xa_target = 0; - ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", - ailp->xa_mount->m_fsname); - if (IS_ERR(ailp->xa_task)) - return -PTR_ERR(ailp->xa_task); - return 0; -} - -void -xfsaild_stop( - struct xfs_ail *ailp) -{ - kthread_stop(ailp->xa_task); -} - - /* Catch misguided souls that try to use this interface on XFS */ STATIC struct inode * xfs_fs_alloc_inode( @@ -1089,7 +1009,7 @@ xfs_fs_write_inode( error = 0; goto out_unlock; } - error = xfs_iflush(ip, 0); + error = xfs_iflush(ip, SYNC_TRYLOCK); } out_unlock: @@ -1202,22 +1122,12 @@ xfs_fs_sync_fs( return -error; if (laptop_mode) { - int prev_sync_seq = mp->m_sync_seq; - /* * The disk must be active because we're syncing. * We schedule xfssyncd now (now that the disk is * active) instead of later (when it might not be). */ - wake_up_process(mp->m_sync_task); - /* - * We have to wait for the sync iteration to complete. - * If we don't, the disk activity caused by the sync - * will come after the sync is completed, and that - * triggers another sync from laptop mode. - */ - wait_event(mp->m_wait_single_sync_task, - mp->m_sync_seq != prev_sync_seq); + flush_delayed_work_sync(&mp->m_sync_work); } return 0; @@ -1345,8 +1255,8 @@ xfs_fs_remount( * options that we can't actually change. */ #if 0 - printk(KERN_INFO - "XFS: mount option \"%s\" not supported for remount\n", p); + xfs_info(mp, + "mount option \"%s\" not supported for remount\n", p); return -EINVAL; #else break; @@ -1367,8 +1277,7 @@ xfs_fs_remount( if (mp->m_update_flags) { error = xfs_mount_log_sb(mp, mp->m_update_flags); if (error) { - cmn_err(CE_WARN, - "XFS: failed to write sb changes"); + xfs_warn(mp, "failed to write sb changes"); return error; } mp->m_update_flags = 0; @@ -1452,15 +1361,15 @@ xfs_finish_flags( mp->m_logbsize = mp->m_sb.sb_logsunit; } else if (mp->m_logbsize > 0 && mp->m_logbsize < mp->m_sb.sb_logsunit) { - cmn_err(CE_WARN, - "XFS: logbuf size must be greater than or equal to log stripe size"); + xfs_warn(mp, + "logbuf size must be greater than or equal to log stripe size"); return XFS_ERROR(EINVAL); } } else { /* Fail a mount if the logbuf is larger than 32K */ if (mp->m_logbsize > XLOG_BIG_RECORD_BSIZE) { - cmn_err(CE_WARN, - "XFS: logbuf size for version 1 logs must be 16K or 32K"); + xfs_warn(mp, + "logbuf size for version 1 logs must be 16K or 32K"); return XFS_ERROR(EINVAL); } } @@ -1477,8 +1386,8 @@ xfs_finish_flags( * prohibit r/w mounts of read-only filesystems */ if ((mp->m_sb.sb_flags & XFS_SBF_READONLY) && !ronly) { - cmn_err(CE_WARN, - "XFS: cannot mount a read-only filesystem as read-write"); + xfs_warn(mp, + "cannot mount a read-only filesystem as read-write"); return XFS_ERROR(EROFS); } @@ -1502,9 +1411,6 @@ xfs_fs_fill_super( spin_lock_init(&mp->m_sb_lock); mutex_init(&mp->m_growlock); atomic_set(&mp->m_active_trans, 0); - INIT_LIST_HEAD(&mp->m_sync_list); - spin_lock_init(&mp->m_sync_lock); - init_waitqueue_head(&mp->m_wait_single_sync_task); mp->m_super = sb; sb->s_fs_info = mp; @@ -1551,10 +1457,14 @@ xfs_fs_fill_super( if (error) goto out_free_sb; - error = xfs_mountfs(mp); - if (error) - goto out_filestream_unmount; - + /* + * we must configure the block size in the superblock before we run the + * full mount process as the mount process can lookup and cache inodes. + * For the same reason we must also initialise the syncd and register + * the inode cache shrinker so that inodes can be reclaimed during + * operations like a quotacheck that iterate all inodes in the + * filesystem. + */ sb->s_magic = XFS_SB_MAGIC; sb->s_blocksize = mp->m_sb.sb_blocksize; sb->s_blocksize_bits = ffs(sb->s_blocksize) - 1; @@ -1562,6 +1472,16 @@ xfs_fs_fill_super( sb->s_time_gran = 1; set_posix_acl_flag(sb); + error = xfs_syncd_init(mp); + if (error) + goto out_filestream_unmount; + + xfs_inode_shrinker_register(mp); + + error = xfs_mountfs(mp); + if (error) + goto out_syncd_stop; + root = igrab(VFS_I(mp->m_rootip)); if (!root) { error = ENOENT; @@ -1577,14 +1497,11 @@ xfs_fs_fill_super( goto fail_vnrele; } - error = xfs_syncd_init(mp); - if (error) - goto fail_vnrele; - - xfs_inode_shrinker_register(mp); - return 0; + out_syncd_stop: + xfs_inode_shrinker_unregister(mp); + xfs_syncd_stop(mp); out_filestream_unmount: xfs_filestream_unmount(mp); out_free_sb: @@ -1608,6 +1525,9 @@ xfs_fs_fill_super( } fail_unmount: + xfs_inode_shrinker_unregister(mp); + xfs_syncd_stop(mp); + /* * Blow away any referenced inode in the filestreams cache. * This can and will cause log traffic as inodes go inactive @@ -1797,6 +1717,38 @@ xfs_destroy_zones(void) } STATIC int __init +xfs_init_workqueues(void) +{ + /* + * max_active is set to 8 to give enough concurency to allow + * multiple work operations on each CPU to run. This allows multiple + * filesystems to be running sync work concurrently, and scales with + * the number of CPUs in the system. + */ + xfs_syncd_wq = alloc_workqueue("xfssyncd", WQ_CPU_INTENSIVE, 8); + if (!xfs_syncd_wq) + goto out; + + xfs_ail_wq = alloc_workqueue("xfsail", WQ_CPU_INTENSIVE, 8); + if (!xfs_ail_wq) + goto out_destroy_syncd; + + return 0; + +out_destroy_syncd: + destroy_workqueue(xfs_syncd_wq); +out: + return -ENOMEM; +} + +STATIC void +xfs_destroy_workqueues(void) +{ + destroy_workqueue(xfs_ail_wq); + destroy_workqueue(xfs_syncd_wq); +} + +STATIC int __init init_xfs_fs(void) { int error; @@ -1811,10 +1763,14 @@ init_xfs_fs(void) if (error) goto out; - error = xfs_mru_cache_init(); + error = xfs_init_workqueues(); if (error) goto out_destroy_zones; + error = xfs_mru_cache_init(); + if (error) + goto out_destroy_wq; + error = xfs_filestream_init(); if (error) goto out_mru_cache_uninit; @@ -1831,6 +1787,10 @@ init_xfs_fs(void) if (error) goto out_cleanup_procfs; + error = xfs_init_workqueues(); + if (error) + goto out_sysctl_unregister; + vfs_initquota(); error = register_filesystem(&xfs_fs_type); @@ -1848,6 +1808,8 @@ init_xfs_fs(void) xfs_filestream_uninit(); out_mru_cache_uninit: xfs_mru_cache_uninit(); + out_destroy_wq: + xfs_destroy_workqueues(); out_destroy_zones: xfs_destroy_zones(); out: @@ -1864,6 +1826,7 @@ exit_xfs_fs(void) xfs_buf_terminate(); xfs_filestream_uninit(); xfs_mru_cache_uninit(); + xfs_destroy_workqueues(); xfs_destroy_zones(); } diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index e22f0057d21..e4f9c1b0836 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -22,6 +22,7 @@ #include "xfs_log.h" #include "xfs_inum.h" #include "xfs_trans.h" +#include "xfs_trans_priv.h" #include "xfs_sb.h" #include "xfs_ag.h" #include "xfs_mount.h" @@ -39,6 +40,8 @@ #include <linux/kthread.h> #include <linux/freezer.h> +struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + /* * The inode lookup is done in batches to keep the amount of lock traffic and * radix tree lookups to a minimum. The batch size is a trade off between @@ -401,7 +404,7 @@ xfs_quiesce_fs( /* * Second stage of a quiesce. The data is already synced, now we have to take * care of the metadata. New transactions are already blocked, so we need to - * wait for any remaining transactions to drain out before proceding. + * wait for any remaining transactions to drain out before proceeding. */ void xfs_quiesce_attr( @@ -425,69 +428,18 @@ xfs_quiesce_attr( /* Push the superblock and write an unmount record */ error = xfs_log_sbcount(mp, 1); if (error) - xfs_fs_cmn_err(CE_WARN, mp, - "xfs_attr_quiesce: failed to log sb changes. " + xfs_warn(mp, "xfs_attr_quiesce: failed to log sb changes. " "Frozen image may not be consistent."); xfs_log_unmount_write(mp); xfs_unmountfs_writesb(mp); } -/* - * Enqueue a work item to be picked up by the vfs xfssyncd thread. - * Doing this has two advantages: - * - It saves on stack space, which is tight in certain situations - * - It can be used (with care) as a mechanism to avoid deadlocks. - * Flushing while allocating in a full filesystem requires both. - */ -STATIC void -xfs_syncd_queue_work( - struct xfs_mount *mp, - void *data, - void (*syncer)(struct xfs_mount *, void *), - struct completion *completion) -{ - struct xfs_sync_work *work; - - work = kmem_alloc(sizeof(struct xfs_sync_work), KM_SLEEP); - INIT_LIST_HEAD(&work->w_list); - work->w_syncer = syncer; - work->w_data = data; - work->w_mount = mp; - work->w_completion = completion; - spin_lock(&mp->m_sync_lock); - list_add_tail(&work->w_list, &mp->m_sync_list); - spin_unlock(&mp->m_sync_lock); - wake_up_process(mp->m_sync_task); -} - -/* - * Flush delayed allocate data, attempting to free up reserved space - * from existing allocations. At this point a new allocation attempt - * has failed with ENOSPC and we are in the process of scratching our - * heads, looking about for more room... - */ -STATIC void -xfs_flush_inodes_work( - struct xfs_mount *mp, - void *arg) -{ - struct inode *inode = arg; - xfs_sync_data(mp, SYNC_TRYLOCK); - xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); - iput(inode); -} - -void -xfs_flush_inodes( - xfs_inode_t *ip) +static void +xfs_syncd_queue_sync( + struct xfs_mount *mp) { - struct inode *inode = VFS_I(ip); - DECLARE_COMPLETION_ONSTACK(completion); - - igrab(inode); - xfs_syncd_queue_work(ip->i_mount, inode, xfs_flush_inodes_work, &completion); - wait_for_completion(&completion); - xfs_log_force(ip->i_mount, XFS_LOG_SYNC); + queue_delayed_work(xfs_syncd_wq, &mp->m_sync_work, + msecs_to_jiffies(xfs_syncd_centisecs * 10)); } /* @@ -497,9 +449,10 @@ xfs_flush_inodes( */ STATIC void xfs_sync_worker( - struct xfs_mount *mp, - void *unused) + struct work_struct *work) { + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_sync_work); int error; if (!(mp->m_flags & XFS_MOUNT_RDONLY)) { @@ -509,73 +462,106 @@ xfs_sync_worker( error = xfs_fs_log_dummy(mp); else xfs_log_force(mp, 0); - xfs_reclaim_inodes(mp, 0); error = xfs_qm_sync(mp, SYNC_TRYLOCK); + + /* start pushing all the metadata that is currently dirty */ + xfs_ail_push_all(mp->m_ail); } - mp->m_sync_seq++; - wake_up(&mp->m_wait_single_sync_task); + + /* queue us up again */ + xfs_syncd_queue_sync(mp); } -STATIC int -xfssyncd( - void *arg) +/* + * Queue a new inode reclaim pass if there are reclaimable inodes and there + * isn't a reclaim pass already in progress. By default it runs every 5s based + * on the xfs syncd work default of 30s. Perhaps this should have it's own + * tunable, but that can be done if this method proves to be ineffective or too + * aggressive. + */ +static void +xfs_syncd_queue_reclaim( + struct xfs_mount *mp) { - struct xfs_mount *mp = arg; - long timeleft; - xfs_sync_work_t *work, *n; - LIST_HEAD (tmp); - - set_freezable(); - timeleft = xfs_syncd_centisecs * msecs_to_jiffies(10); - for (;;) { - if (list_empty(&mp->m_sync_list)) - timeleft = schedule_timeout_interruptible(timeleft); - /* swsusp */ - try_to_freeze(); - if (kthread_should_stop() && list_empty(&mp->m_sync_list)) - break; - spin_lock(&mp->m_sync_lock); - /* - * We can get woken by laptop mode, to do a sync - - * that's the (only!) case where the list would be - * empty with time remaining. - */ - if (!timeleft || list_empty(&mp->m_sync_list)) { - if (!timeleft) - timeleft = xfs_syncd_centisecs * - msecs_to_jiffies(10); - INIT_LIST_HEAD(&mp->m_sync_work.w_list); - list_add_tail(&mp->m_sync_work.w_list, - &mp->m_sync_list); - } - list_splice_init(&mp->m_sync_list, &tmp); - spin_unlock(&mp->m_sync_lock); + /* + * We can have inodes enter reclaim after we've shut down the syncd + * workqueue during unmount, so don't allow reclaim work to be queued + * during unmount. + */ + if (!(mp->m_super->s_flags & MS_ACTIVE)) + return; - list_for_each_entry_safe(work, n, &tmp, w_list) { - (*work->w_syncer)(mp, work->w_data); - list_del(&work->w_list); - if (work == &mp->m_sync_work) - continue; - if (work->w_completion) - complete(work->w_completion); - kmem_free(work); - } + rcu_read_lock(); + if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_RECLAIM_TAG)) { + queue_delayed_work(xfs_syncd_wq, &mp->m_reclaim_work, + msecs_to_jiffies(xfs_syncd_centisecs / 6 * 10)); } + rcu_read_unlock(); +} - return 0; +/* + * This is a fast pass over the inode cache to try to get reclaim moving on as + * many inodes as possible in a short period of time. It kicks itself every few + * seconds, as well as being kicked by the inode cache shrinker when memory + * goes low. It scans as quickly as possible avoiding locked inodes or those + * already being flushed, and once done schedules a future pass. + */ +STATIC void +xfs_reclaim_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(to_delayed_work(work), + struct xfs_mount, m_reclaim_work); + + xfs_reclaim_inodes(mp, SYNC_TRYLOCK); + xfs_syncd_queue_reclaim(mp); +} + +/* + * Flush delayed allocate data, attempting to free up reserved space + * from existing allocations. At this point a new allocation attempt + * has failed with ENOSPC and we are in the process of scratching our + * heads, looking about for more room. + * + * Queue a new data flush if there isn't one already in progress and + * wait for completion of the flush. This means that we only ever have one + * inode flush in progress no matter how many ENOSPC events are occurring and + * so will prevent the system from bogging down due to every concurrent + * ENOSPC event scanning all the active inodes in the system for writeback. + */ +void +xfs_flush_inodes( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + queue_work(xfs_syncd_wq, &mp->m_flush_work); + flush_work_sync(&mp->m_flush_work); +} + +STATIC void +xfs_flush_worker( + struct work_struct *work) +{ + struct xfs_mount *mp = container_of(work, + struct xfs_mount, m_flush_work); + + xfs_sync_data(mp, SYNC_TRYLOCK); + xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT); } int xfs_syncd_init( struct xfs_mount *mp) { - mp->m_sync_work.w_syncer = xfs_sync_worker; - mp->m_sync_work.w_mount = mp; - mp->m_sync_work.w_completion = NULL; - mp->m_sync_task = kthread_run(xfssyncd, mp, "xfssyncd/%s", mp->m_fsname); - if (IS_ERR(mp->m_sync_task)) - return -PTR_ERR(mp->m_sync_task); + INIT_WORK(&mp->m_flush_work, xfs_flush_worker); + INIT_DELAYED_WORK(&mp->m_sync_work, xfs_sync_worker); + INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); + + xfs_syncd_queue_sync(mp); + xfs_syncd_queue_reclaim(mp); + return 0; } @@ -583,7 +569,9 @@ void xfs_syncd_stop( struct xfs_mount *mp) { - kthread_stop(mp->m_sync_task); + cancel_delayed_work_sync(&mp->m_sync_work); + cancel_delayed_work_sync(&mp->m_reclaim_work); + cancel_work_sync(&mp->m_flush_work); } void @@ -602,6 +590,10 @@ __xfs_inode_set_reclaim_tag( XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); spin_unlock(&ip->i_mount->m_perag_lock); + + /* schedule periodic background inode reclaim */ + xfs_syncd_queue_reclaim(ip->i_mount); + trace_xfs_perag_set_reclaim(ip->i_mount, pag->pag_agno, -1, _RET_IP_); } @@ -762,8 +754,10 @@ xfs_reclaim_inode( struct xfs_perag *pag, int sync_mode) { - int error = 0; + int error; +restart: + error = 0; xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) @@ -789,9 +783,31 @@ xfs_reclaim_inode( if (xfs_inode_clean(ip)) goto reclaim; - /* Now we have an inode that needs flushing */ - error = xfs_iflush(ip, sync_mode); + /* + * Now we have an inode that needs flushing. + * + * We do a nonblocking flush here even if we are doing a SYNC_WAIT + * reclaim as we can deadlock with inode cluster removal. + * xfs_ifree_cluster() can lock the inode buffer before it locks the + * ip->i_lock, and we are doing the exact opposite here. As a result, + * doing a blocking xfs_itobp() to get the cluster buffer will result + * in an ABBA deadlock with xfs_ifree_cluster(). + * + * As xfs_ifree_cluser() must gather all inodes that are active in the + * cache to mark them stale, if we hit this case we don't actually want + * to do IO here - we want the inode marked stale so we can simply + * reclaim it. Hence if we get an EAGAIN error on a SYNC_WAIT flush, + * just unlock the inode, back off and try again. Hopefully the next + * pass through will see the stale flag set on the inode. + */ + error = xfs_iflush(ip, SYNC_TRYLOCK | sync_mode); if (sync_mode & SYNC_WAIT) { + if (error == EAGAIN) { + xfs_iunlock(ip, XFS_ILOCK_EXCL); + /* backoff longer than in xfs_ifree_cluster */ + delay(2); + goto restart; + } xfs_iflock(ip); goto reclaim; } @@ -806,7 +822,7 @@ xfs_reclaim_inode( * pass on the error. */ if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { - xfs_fs_cmn_err(CE_WARN, ip->i_mount, + xfs_warn(ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); } @@ -994,7 +1010,13 @@ xfs_reclaim_inodes( } /* - * Shrinker infrastructure. + * Inode cache shrinker. + * + * When called we make sure that there is a background (fast) inode reclaim in + * progress, while we will throttle the speed of reclaim via doiing synchronous + * reclaim of inodes. That means if we come across dirty inodes, we wait for + * them to be cleaned, which we hope will not be very long due to the + * background walker having already kicked the IO off on those dirty inodes. */ static int xfs_reclaim_inode_shrink( @@ -1009,10 +1031,15 @@ xfs_reclaim_inode_shrink( mp = container_of(shrink, struct xfs_mount, m_inode_shrink); if (nr_to_scan) { + /* kick background reclaimer and push the AIL */ + xfs_syncd_queue_reclaim(mp); + xfs_ail_push_all(mp->m_ail); + if (!(gfp_mask & __GFP_FS)) return -1; - xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, + &nr_to_scan); /* terminate if we don't exhaust the scan */ if (nr_to_scan > 0) return -1; diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 32ba6628290..e3a6ad27415 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -32,6 +32,8 @@ typedef struct xfs_sync_work { #define SYNC_WAIT 0x0001 /* wait for i/o to complete */ #define SYNC_TRYLOCK 0x0002 /* only try to lock inodes */ +extern struct workqueue_struct *xfs_syncd_wq; /* sync workqueue */ + int xfs_syncd_init(struct xfs_mount *mp); void xfs_syncd_stop(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_sysctl.c b/fs/xfs/linux-2.6/xfs_sysctl.c index ee3cee097e7..ee2d2adaa43 100644 --- a/fs/xfs/linux-2.6/xfs_sysctl.c +++ b/fs/xfs/linux-2.6/xfs_sysctl.c @@ -37,7 +37,7 @@ xfs_stats_clear_proc_handler( ret = proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); if (!ret && write && *valp) { - printk("XFS Clearing xfsstats\n"); + xfs_notice(NULL, "Clearing xfsstats"); for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ |