diff options
Diffstat (limited to 'fs/xfs/linux-2.6')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.c | 237 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_buf.h | 81 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_cred.h | 28 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_fs_subr.c | 31 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_globals.c | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_globals.h | 23 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_ioctl.c | 19 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_ioctl32.c | 5 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_ioctl32.h | 6 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_iops.c | 39 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_linux.h | 5 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_super.c | 27 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_super.h | 1 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.c | 413 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_sync.h | 4 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_trace.h | 5 | ||||
-rw-r--r-- | fs/xfs/linux-2.6/xfs_version.h | 29 |
17 files changed, 447 insertions, 507 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 286e36e21da..ba5312802aa 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -188,8 +188,8 @@ _xfs_buf_initialize( atomic_set(&bp->b_hold, 1); init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_list); - INIT_LIST_HEAD(&bp->b_hash_list); - init_MUTEX_LOCKED(&bp->b_sema); /* held, no waiters */ + RB_CLEAR_NODE(&bp->b_rbnode); + sema_init(&bp->b_sema, 0); /* held, no waiters */ XB_SET_OWNER(bp); bp->b_target = target; bp->b_file_offset = range_base; @@ -262,8 +262,6 @@ xfs_buf_free( { trace_xfs_buf_free(bp, _RET_IP_); - ASSERT(list_empty(&bp->b_hash_list)); - if (bp->b_flags & (_XBF_PAGE_CACHE|_XBF_PAGES)) { uint i; @@ -422,8 +420,10 @@ _xfs_buf_find( { xfs_off_t range_base; size_t range_length; - xfs_bufhash_t *hash; - xfs_buf_t *bp, *n; + struct xfs_perag *pag; + struct rb_node **rbp; + struct rb_node *parent; + xfs_buf_t *bp; range_base = (ioff << BBSHIFT); range_length = (isize << BBSHIFT); @@ -432,14 +432,37 @@ _xfs_buf_find( ASSERT(!(range_length < (1 << btp->bt_sshift))); ASSERT(!(range_base & (xfs_off_t)btp->bt_smask)); - hash = &btp->bt_hash[hash_long((unsigned long)ioff, btp->bt_hashshift)]; - - spin_lock(&hash->bh_lock); - - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (bp->b_file_offset == range_base && - bp->b_buffer_length == range_length) { + /* get tree root */ + pag = xfs_perag_get(btp->bt_mount, + xfs_daddr_to_agno(btp->bt_mount, ioff)); + + /* walk tree */ + spin_lock(&pag->pag_buf_lock); + rbp = &pag->pag_buf_tree.rb_node; + parent = NULL; + bp = NULL; + while (*rbp) { + parent = *rbp; + bp = rb_entry(parent, struct xfs_buf, b_rbnode); + + if (range_base < bp->b_file_offset) + rbp = &(*rbp)->rb_left; + else if (range_base > bp->b_file_offset) + rbp = &(*rbp)->rb_right; + else { + /* + * found a block offset match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching to the right for an exact match. + */ + if (bp->b_buffer_length != range_length) { + ASSERT(bp->b_flags & XBF_STALE); + rbp = &(*rbp)->rb_right; + continue; + } atomic_inc(&bp->b_hold); goto found; } @@ -449,17 +472,21 @@ _xfs_buf_find( if (new_bp) { _xfs_buf_initialize(new_bp, btp, range_base, range_length, flags); - new_bp->b_hash = hash; - list_add(&new_bp->b_hash_list, &hash->bh_list); + rb_link_node(&new_bp->b_rbnode, parent, rbp); + rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); + /* the buffer keeps the perag reference until it is freed */ + new_bp->b_pag = pag; + spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(xb_miss_locked); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } - - spin_unlock(&hash->bh_lock); return new_bp; found: - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); /* Attempt to get the semaphore without sleeping, * if this does not work then we need to drop the @@ -625,8 +652,7 @@ void xfs_buf_readahead( xfs_buftarg_t *target, xfs_off_t ioff, - size_t isize, - xfs_buf_flags_t flags) + size_t isize) { struct backing_dev_info *bdi; @@ -634,8 +660,42 @@ xfs_buf_readahead( if (bdi_read_congested(bdi)) return; - flags |= (XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD); - xfs_buf_read(target, ioff, isize, flags); + xfs_buf_read(target, ioff, isize, + XBF_TRYLOCK|XBF_ASYNC|XBF_READ_AHEAD|XBF_DONT_BLOCK); +} + +/* + * Read an uncached buffer from disk. Allocates and returns a locked + * buffer containing the disk contents or nothing. + */ +struct xfs_buf * +xfs_buf_read_uncached( + struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, + size_t length, + int flags) +{ + xfs_buf_t *bp; + int error; + + bp = xfs_buf_get_uncached(target, length, flags); + if (!bp) + return NULL; + + /* set up the buffer for a read IO */ + xfs_buf_lock(bp); + XFS_BUF_SET_ADDR(bp, daddr); + XFS_BUF_READ(bp); + XFS_BUF_BUSY(bp); + + xfsbdstrat(mp, bp); + error = xfs_buf_iowait(bp); + if (error || bp->b_error) { + xfs_buf_relse(bp); + return NULL; + } + return bp; } xfs_buf_t * @@ -707,9 +767,10 @@ xfs_buf_associate_memory( } xfs_buf_t * -xfs_buf_get_noaddr( +xfs_buf_get_uncached( + struct xfs_buftarg *target, size_t len, - xfs_buftarg_t *target) + int flags) { unsigned long page_count = PAGE_ALIGN(len) >> PAGE_SHIFT; int error, i; @@ -725,7 +786,7 @@ xfs_buf_get_noaddr( goto fail_free_buf; for (i = 0; i < page_count; i++) { - bp->b_pages[i] = alloc_page(GFP_KERNEL); + bp->b_pages[i] = alloc_page(xb_to_gfp(flags)); if (!bp->b_pages[i]) goto fail_free_mem; } @@ -740,7 +801,7 @@ xfs_buf_get_noaddr( xfs_buf_unlock(bp); - trace_xfs_buf_get_noaddr(bp, _RET_IP_); + trace_xfs_buf_get_uncached(bp, _RET_IP_); return bp; fail_free_mem: @@ -774,29 +835,30 @@ void xfs_buf_rele( xfs_buf_t *bp) { - xfs_bufhash_t *hash = bp->b_hash; + struct xfs_perag *pag = bp->b_pag; trace_xfs_buf_rele(bp, _RET_IP_); - if (unlikely(!hash)) { + if (!pag) { ASSERT(!bp->b_relse); + ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) xfs_buf_free(bp); return; } + ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); ASSERT(atomic_read(&bp->b_hold) > 0); - if (atomic_dec_and_lock(&bp->b_hold, &hash->bh_lock)) { + if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) { if (bp->b_relse) { atomic_inc(&bp->b_hold); - spin_unlock(&hash->bh_lock); - (*(bp->b_relse)) (bp); - } else if (bp->b_flags & XBF_FS_MANAGED) { - spin_unlock(&hash->bh_lock); + spin_unlock(&pag->pag_buf_lock); + bp->b_relse(bp); } else { ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); - list_del_init(&bp->b_hash_list); - spin_unlock(&hash->bh_lock); + rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); xfs_buf_free(bp); } } @@ -859,7 +921,7 @@ xfs_buf_lock( trace_xfs_buf_lock(bp, _RET_IP_); if (atomic_read(&bp->b_pin_count) && (bp->b_flags & XBF_STALE)) - xfs_log_force(bp->b_mount, 0); + xfs_log_force(bp->b_target->bt_mount, 0); if (atomic_read(&bp->b_io_remaining)) blk_run_address_space(bp->b_target->bt_mapping); down(&bp->b_sema); @@ -924,19 +986,7 @@ xfs_buf_iodone_work( xfs_buf_t *bp = container_of(work, xfs_buf_t, b_iodone_work); - /* - * We can get an EOPNOTSUPP to ordered writes. Here we clear the - * ordered flag and reissue them. Because we can't tell the higher - * layers directly that they should not issue ordered I/O anymore, they - * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. - */ - if ((bp->b_error == EOPNOTSUPP) && - (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { - trace_xfs_buf_ordered_retry(bp, _RET_IP_); - bp->b_flags &= ~XBF_ORDERED; - bp->b_flags |= _XFS_BARRIER_FAILED; - xfs_buf_iorequest(bp); - } else if (bp->b_iodone) + if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); @@ -982,7 +1032,6 @@ xfs_bwrite( { int error; - bp->b_mount = mp; bp->b_flags |= XBF_WRITE; bp->b_flags &= ~(XBF_ASYNC | XBF_READ); @@ -1003,8 +1052,6 @@ xfs_bdwrite( { trace_xfs_buf_bdwrite(bp, _RET_IP_); - bp->b_mount = mp; - bp->b_flags &= ~XBF_READ; bp->b_flags |= (XBF_DELWRI | XBF_ASYNC); @@ -1013,7 +1060,7 @@ xfs_bdwrite( /* * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call biodone + * We attach the EIO error, muck with its flags, and call xfs_buf_ioend * so that the proper iodone callbacks get called. */ STATIC int @@ -1030,21 +1077,21 @@ xfs_bioerror( XFS_BUF_ERROR(bp, EIO); /* - * We're calling biodone, so delete XBF_DONE flag. + * We're calling xfs_buf_ioend, so delete XBF_DONE flag. */ XFS_BUF_UNREAD(bp); XFS_BUF_UNDELAYWRITE(bp); XFS_BUF_UNDONE(bp); XFS_BUF_STALE(bp); - xfs_biodone(bp); + xfs_buf_ioend(bp, 0); return EIO; } /* * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the biodone call. + * here ourselves, and avoiding the xfs_buf_ioend call. * This is meant for userdata errors; metadata bufs come with * iodone functions attached, so that we can track down errors. */ @@ -1093,7 +1140,7 @@ int xfs_bdstrat_cb( struct xfs_buf *bp) { - if (XFS_FORCED_SHUTDOWN(bp->b_mount)) { + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { trace_xfs_bdstrat_shut(bp, _RET_IP_); /* * Metadata write that didn't get logged but @@ -1195,7 +1242,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_ORDERED) { ASSERT(!(bp->b_flags & XBF_READ)); - rw = WRITE_BARRIER; + rw = WRITE_FLUSH_FUA; } else if (bp->b_flags & XBF_LOG_BUFFER) { ASSERT(!(bp->b_flags & XBF_READ_AHEAD)); bp->b_flags &= ~_XBF_RUN_QUEUES; @@ -1399,62 +1446,24 @@ xfs_buf_iomove( */ void xfs_wait_buftarg( - xfs_buftarg_t *btp) -{ - xfs_buf_t *bp, *n; - xfs_bufhash_t *hash; - uint i; - - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - hash = &btp->bt_hash[i]; -again: - spin_lock(&hash->bh_lock); - list_for_each_entry_safe(bp, n, &hash->bh_list, b_hash_list) { - ASSERT(btp == bp->b_target); - if (!(bp->b_flags & XBF_FS_MANAGED)) { - spin_unlock(&hash->bh_lock); - /* - * Catch superblock reference count leaks - * immediately - */ - BUG_ON(bp->b_bn == 0); - delay(100); - goto again; - } - } - spin_unlock(&hash->bh_lock); - } -} - -/* - * Allocate buffer hash table for a given target. - * For devices containing metadata (i.e. not the log/realtime devices) - * we need to allocate a much larger hash table. - */ -STATIC void -xfs_alloc_bufhash( - xfs_buftarg_t *btp, - int external) + struct xfs_buftarg *btp) { - unsigned int i; + struct xfs_perag *pag; + uint i; - btp->bt_hashshift = external ? 3 : 12; /* 8 or 4096 buckets */ - btp->bt_hash = kmem_zalloc_large((1 << btp->bt_hashshift) * - sizeof(xfs_bufhash_t)); - for (i = 0; i < (1 << btp->bt_hashshift); i++) { - spin_lock_init(&btp->bt_hash[i].bh_lock); - INIT_LIST_HEAD(&btp->bt_hash[i].bh_list); + for (i = 0; i < btp->bt_mount->m_sb.sb_agcount; i++) { + pag = xfs_perag_get(btp->bt_mount, i); + spin_lock(&pag->pag_buf_lock); + while (rb_first(&pag->pag_buf_tree)) { + spin_unlock(&pag->pag_buf_lock); + delay(100); + spin_lock(&pag->pag_buf_lock); + } + spin_unlock(&pag->pag_buf_lock); + xfs_perag_put(pag); } } -STATIC void -xfs_free_bufhash( - xfs_buftarg_t *btp) -{ - kmem_free_large(btp->bt_hash); - btp->bt_hash = NULL; -} - /* * buftarg list for delwrite queue processing */ @@ -1487,7 +1496,6 @@ xfs_free_buftarg( xfs_flush_buftarg(btp, 1); if (mp->m_flags & XFS_MOUNT_BARRIER) xfs_blkdev_issue_flush(btp); - xfs_free_bufhash(btp); iput(btp->bt_mapping->host); /* Unregister the buftarg first so that we don't get a @@ -1609,6 +1617,7 @@ out_error: xfs_buftarg_t * xfs_alloc_buftarg( + struct xfs_mount *mp, struct block_device *bdev, int external, const char *fsname) @@ -1617,6 +1626,7 @@ xfs_alloc_buftarg( btp = kmem_zalloc(sizeof(*btp), KM_SLEEP); + btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; if (xfs_setsize_buftarg_early(btp, bdev)) @@ -1625,7 +1635,6 @@ xfs_alloc_buftarg( goto error; if (xfs_alloc_delwrite_queue(btp, fsname)) goto error; - xfs_alloc_bufhash(btp, external); return btp; error: @@ -1916,7 +1925,7 @@ xfs_flush_buftarg( bp = list_first_entry(&wait_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); - xfs_iowait(bp); + xfs_buf_iowait(bp); xfs_buf_relse(bp); } } @@ -1933,7 +1942,7 @@ xfs_buf_init(void) goto out; xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_RESCUER | WQ_HIGHPRI, 1); + WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 2a05614f0b9..383a3f37cf9 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -51,7 +51,6 @@ typedef enum { #define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ #define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ #define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */ -#define XBF_FS_MANAGED (1 << 8) /* filesystem controls freeing memory */ #define XBF_ORDERED (1 << 11)/* use ordered writes */ #define XBF_READ_AHEAD (1 << 12)/* asynchronous read-ahead */ #define XBF_LOG_BUFFER (1 << 13)/* this is a buffer used for the log */ @@ -86,14 +85,6 @@ typedef enum { */ #define _XBF_PAGE_LOCKED (1 << 22) -/* - * If we try a barrier write, but it fails we have to communicate - * this to the upper layers. Unfortunately b_error gets overwritten - * when the buffer is re-issued so we have to add another flag to - * keep this information. - */ -#define _XFS_BARRIER_FAILED (1 << 23) - typedef unsigned int xfs_buf_flags_t; #define XFS_BUF_FLAGS \ @@ -104,7 +95,6 @@ typedef unsigned int xfs_buf_flags_t; { XBF_DONE, "DONE" }, \ { XBF_DELWRI, "DELWRI" }, \ { XBF_STALE, "STALE" }, \ - { XBF_FS_MANAGED, "FS_MANAGED" }, \ { XBF_ORDERED, "ORDERED" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ { XBF_LOCK, "LOCK" }, /* should never be set */\ @@ -114,8 +104,7 @@ typedef unsigned int xfs_buf_flags_t; { _XBF_PAGES, "PAGES" }, \ { _XBF_RUN_QUEUES, "RUN_QUEUES" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ - { _XBF_PAGE_LOCKED, "PAGE_LOCKED" }, \ - { _XFS_BARRIER_FAILED, "BARRIER_FAILED" } + { _XBF_PAGE_LOCKED, "PAGE_LOCKED" } typedef enum { @@ -132,14 +121,11 @@ typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; struct address_space *bt_mapping; + struct xfs_mount *bt_mount; unsigned int bt_bsize; unsigned int bt_sshift; size_t bt_smask; - /* per device buffer hash table */ - uint bt_hashshift; - xfs_bufhash_t *bt_hash; - /* per device delwri queue */ struct task_struct *bt_task; struct list_head bt_list; @@ -167,34 +153,41 @@ typedef int (*xfs_buf_bdstrat_t)(struct xfs_buf *); #define XB_PAGES 2 typedef struct xfs_buf { + /* + * first cacheline holds all the fields needed for an uncontended cache + * hit to be fully processed. The semaphore straddles the cacheline + * boundary, but the counter and lock sits on the first cacheline, + * which is the only bit that is touched if we hit the semaphore + * fast-path on locking. + */ + struct rb_node b_rbnode; /* rbtree node */ + xfs_off_t b_file_offset; /* offset in file */ + size_t b_buffer_length;/* size of buffer in bytes */ + atomic_t b_hold; /* reference count */ + xfs_buf_flags_t b_flags; /* status flags */ struct semaphore b_sema; /* semaphore for lockables */ - unsigned long b_queuetime; /* time buffer was queued */ - atomic_t b_pin_count; /* pin count */ + wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; - xfs_buf_flags_t b_flags; /* status flags */ - struct list_head b_hash_list; /* hash table list */ - xfs_bufhash_t *b_hash; /* hash table list start */ + struct xfs_perag *b_pag; /* contains rbtree root */ xfs_buftarg_t *b_target; /* buffer target (device) */ - atomic_t b_hold; /* reference count */ xfs_daddr_t b_bn; /* block number for I/O */ - xfs_off_t b_file_offset; /* offset in file */ - size_t b_buffer_length;/* size of buffer in bytes */ size_t b_count_desired;/* desired transfer size */ void *b_addr; /* virtual address of buffer */ struct work_struct b_iodone_work; - atomic_t b_io_remaining; /* #outstanding I/O requests */ xfs_buf_iodone_t b_iodone; /* I/O completion function */ xfs_buf_relse_t b_relse; /* releasing function */ struct completion b_iowait; /* queue for I/O waiters */ void *b_fspriv; void *b_fspriv2; - struct xfs_mount *b_mount; - unsigned short b_error; /* error code on I/O */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset in first page */ struct page **b_pages; /* array of page pointers */ struct page *b_page_array[XB_PAGES]; /* inline pages */ + unsigned long b_queuetime; /* time buffer was queued */ + atomic_t b_pin_count; /* pin count */ + atomic_t b_io_remaining; /* #outstanding I/O requests */ + unsigned int b_page_count; /* size of page array */ + unsigned int b_offset; /* page offset in first page */ + unsigned short b_error; /* error code on I/O */ #ifdef XFS_BUF_LOCK_TRACKING int b_last_holder; #endif @@ -213,11 +206,13 @@ extern xfs_buf_t *xfs_buf_read(xfs_buftarg_t *, xfs_off_t, size_t, xfs_buf_flags_t); extern xfs_buf_t *xfs_buf_get_empty(size_t, xfs_buftarg_t *); -extern xfs_buf_t *xfs_buf_get_noaddr(size_t, xfs_buftarg_t *); +extern xfs_buf_t *xfs_buf_get_uncached(struct xfs_buftarg *, size_t, int); extern int xfs_buf_associate_memory(xfs_buf_t *, void *, size_t); extern void xfs_buf_hold(xfs_buf_t *); -extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t, - xfs_buf_flags_t); +extern void xfs_buf_readahead(xfs_buftarg_t *, xfs_off_t, size_t); +struct xfs_buf *xfs_buf_read_uncached(struct xfs_mount *mp, + struct xfs_buftarg *target, + xfs_daddr_t daddr, size_t length, int flags); /* Releasing Buffers */ extern void xfs_buf_free(xfs_buf_t *); @@ -242,6 +237,8 @@ extern int xfs_buf_iorequest(xfs_buf_t *); extern int xfs_buf_iowait(xfs_buf_t *); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); +#define xfs_buf_zero(bp, off, len) \ + xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) static inline int xfs_buf_geterror(xfs_buf_t *bp) { @@ -276,8 +273,6 @@ extern void xfs_buf_terminate(void); XFS_BUF_DONE(bp); \ } while (0) -#define XFS_BUF_UNMANAGE(bp) ((bp)->b_flags &= ~XBF_FS_MANAGED) - #define XFS_BUF_DELAYWRITE(bp) ((bp)->b_flags |= XBF_DELWRI) #define XFS_BUF_UNDELAYWRITE(bp) xfs_buf_delwri_dequeue(bp) #define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI) @@ -356,25 +351,11 @@ static inline void xfs_buf_relse(xfs_buf_t *bp) xfs_buf_rele(bp); } -#define xfs_biodone(bp) xfs_buf_ioend(bp, 0) - -#define xfs_biomove(bp, off, len, data, rw) \ - xfs_buf_iomove((bp), (off), (len), (data), \ - ((rw) == XBF_WRITE) ? XBRW_WRITE : XBRW_READ) - -#define xfs_biozero(bp, off, len) \ - xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) - -#define xfs_iowait(bp) xfs_buf_iowait(bp) - -#define xfs_baread(target, rablkno, ralen) \ - xfs_buf_readahead((target), (rablkno), (ralen), XBF_DONT_BLOCK) - - /* * Handling of buftargs. */ -extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int, const char *); +extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *, + struct block_device *, int, const char *); extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); diff --git a/fs/xfs/linux-2.6/xfs_cred.h b/fs/xfs/linux-2.6/xfs_cred.h deleted file mode 100644 index 55bddf3b609..00000000000 --- a/fs/xfs/linux-2.6/xfs_cred.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_CRED_H__ -#define __XFS_CRED_H__ - -#include <linux/capability.h> - -/* - * Credentials - */ -typedef const struct cred cred_t; - -#endif /* __XFS_CRED_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_fs_subr.c b/fs/xfs/linux-2.6/xfs_fs_subr.c index 1f279b012f9..ed88ed16811 100644 --- a/fs/xfs/linux-2.6/xfs_fs_subr.c +++ b/fs/xfs/linux-2.6/xfs_fs_subr.c @@ -32,10 +32,9 @@ xfs_tosspages( xfs_off_t last, int fiopt) { - struct address_space *mapping = VFS_I(ip)->i_mapping; - - if (mapping->nrpages) - truncate_inode_pages(mapping, first); + /* can't toss partial tail pages, so mask them out */ + last &= ~(PAGE_SIZE - 1); + truncate_inode_pages_range(VFS_I(ip)->i_mapping, first, last - 1); } int @@ -50,12 +49,11 @@ xfs_flushinval_pages( trace_xfs_pagecache_inval(ip, first, last); - if (mapping->nrpages) { - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = filemap_write_and_wait(mapping); - if (!ret) - truncate_inode_pages(mapping, first); - } + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = filemap_write_and_wait_range(mapping, first, + last == -1 ? LLONG_MAX : last); + if (!ret) + truncate_inode_pages_range(mapping, first, last); return -ret; } @@ -71,10 +69,9 @@ xfs_flush_pages( int ret = 0; int ret2; - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - xfs_iflags_clear(ip, XFS_ITRUNCATED); - ret = -filemap_fdatawrite(mapping); - } + xfs_iflags_clear(ip, XFS_ITRUNCATED); + ret = -filemap_fdatawrite_range(mapping, first, + last == -1 ? LLONG_MAX : last); if (flags & XBF_ASYNC) return ret; ret2 = xfs_wait_on_pages(ip, first, last); @@ -91,7 +88,9 @@ xfs_wait_on_pages( { struct address_space *mapping = VFS_I(ip)->i_mapping; - if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) - return -filemap_fdatawait(mapping); + if (mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK)) { + return -filemap_fdatawait_range(mapping, first, + last == -1 ? ip->i_size - 1 : last); + } return 0; } diff --git a/fs/xfs/linux-2.6/xfs_globals.c b/fs/xfs/linux-2.6/xfs_globals.c index 2ae8b1ccb02..76e81cff70b 100644 --- a/fs/xfs/linux-2.6/xfs_globals.c +++ b/fs/xfs/linux-2.6/xfs_globals.c @@ -16,7 +16,6 @@ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA */ #include "xfs.h" -#include "xfs_cred.h" #include "xfs_sysctl.h" /* diff --git a/fs/xfs/linux-2.6/xfs_globals.h b/fs/xfs/linux-2.6/xfs_globals.h deleted file mode 100644 index 69f71caf061..00000000000 --- a/fs/xfs/linux-2.6/xfs_globals.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_GLOBALS_H__ -#define __XFS_GLOBALS_H__ - -extern uint64_t xfs_panic_mask; /* set to cause more panics */ - -#endif /* __XFS_GLOBALS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 3b9e626f7cd..2ea238f6d38 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -790,7 +790,7 @@ xfs_ioc_fsgetxattr( xfs_ilock(ip, XFS_ILOCK_SHARED); fa.fsx_xflags = xfs_ip2xflags(ip); fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog; - fa.fsx_projid = ip->i_d.di_projid; + fa.fsx_projid = xfs_get_projid(ip); if (attr) { if (ip->i_afp) { @@ -909,10 +909,10 @@ xfs_ioctl_setattr( return XFS_ERROR(EIO); /* - * Disallow 32bit project ids because on-disk structure - * is 16bit only. + * Disallow 32bit project ids when projid32bit feature is not enabled. */ - if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1)) + if ((mask & FSX_PROJID) && (fa->fsx_projid > (__uint16_t)-1) && + !xfs_sb_version_hasprojid32bit(&ip->i_mount->m_sb)) return XFS_ERROR(EINVAL); /* @@ -961,7 +961,7 @@ xfs_ioctl_setattr( if (mask & FSX_PROJID) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp) && - ip->i_d.di_projid != fa->fsx_projid) { + xfs_get_projid(ip) != fa->fsx_projid) { ASSERT(tp); code = xfs_qm_vop_chown_reserve(tp, ip, udqp, gdqp, capable(CAP_FOWNER) ? @@ -1063,12 +1063,12 @@ xfs_ioctl_setattr( * Change the ownerships and register quota modifications * in the transaction. */ - if (ip->i_d.di_projid != fa->fsx_projid) { + if (xfs_get_projid(ip) != fa->fsx_projid) { if (XFS_IS_QUOTA_RUNNING(mp) && XFS_IS_PQUOTA_ON(mp)) { olddquot = xfs_qm_vop_chown(tp, ip, &ip->i_gdquot, gdqp); } - ip->i_d.di_projid = fa->fsx_projid; + xfs_set_projid(ip, fa->fsx_projid); /* * We may have to rev the inode as well as @@ -1088,8 +1088,8 @@ xfs_ioctl_setattr( xfs_diflags_to_linux(ip); } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); XFS_STATS_INC(xs_ig_attrchg); @@ -1301,7 +1301,8 @@ xfs_file_ioctl( case XFS_IOC_ALLOCSP64: case XFS_IOC_FREESP64: case XFS_IOC_RESVSP64: - case XFS_IOC_UNRESVSP64: { + case XFS_IOC_UNRESVSP64: + case XFS_IOC_ZERO_RANGE: { xfs_flock64_t bf; if (copy_from_user(&bf, arg, sizeof(bf))) diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index 6c83f7f62dc..b3486dfa552 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -164,7 +164,8 @@ xfs_ioctl32_bstat_copyin( get_user(bstat->bs_extsize, &bstat32->bs_extsize) || get_user(bstat->bs_extents, &bstat32->bs_extents) || get_user(bstat->bs_gen, &bstat32->bs_gen) || - get_user(bstat->bs_projid, &bstat32->bs_projid) || + get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || + get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || get_user(bstat->bs_aextents, &bstat32->bs_aextents)) @@ -218,6 +219,7 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_extents, &p32->bs_extents) || put_user(buffer->bs_gen, &p32->bs_gen) || put_user(buffer->bs_projid, &p32->bs_projid) || + put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) @@ -574,6 +576,7 @@ xfs_file_compat_ioctl( case XFS_IOC_FSGEOMETRY_V1: case XFS_IOC_FSGROWFSDATA: case XFS_IOC_FSGROWFSRT: + case XFS_IOC_ZERO_RANGE: return xfs_file_ioctl(filp, cmd, p); #else case XFS_IOC_ALLOCSP_32: diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.h b/fs/xfs/linux-2.6/xfs_ioctl32.h index 1024c4f8ba0..08b605792a9 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.h +++ b/fs/xfs/linux-2.6/xfs_ioctl32.h @@ -65,8 +65,10 @@ typedef struct compat_xfs_bstat { __s32 bs_extsize; /* extent size */ __s32 bs_extents; /* number of extents */ __u32 bs_gen; /* generation count */ - __u16 bs_projid; /* project id */ - unsigned char bs_pad[14]; /* pad space, unused */ + __u16 bs_projid_lo; /* lower part of project id */ +#define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_projid_hi; /* high part of project id */ + unsigned char bs_pad[12]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index b1fc2a6bfe8..ec858e09d54 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -95,41 +95,6 @@ xfs_mark_inode_dirty( } /* - * Change the requested timestamp in the given inode. - * We don't lock across timestamp updates, and we don't log them but - * we do record the fact that there is dirty information in core. - */ -void -xfs_ichgtime( - xfs_inode_t *ip, - int flags) -{ - struct inode *inode = VFS_I(ip); - timespec_t tv; - int sync_it = 0; - - tv = current_fs_time(inode->i_sb); - - if ((flags & XFS_ICHGTIME_MOD) && - !timespec_equal(&inode->i_mtime, &tv)) { - inode->i_mtime = tv; - sync_it = 1; - } - if ((flags & XFS_ICHGTIME_CHG) && - !timespec_equal(&inode->i_ctime, &tv)) { - inode->i_ctime = tv; - sync_it = 1; - } - - /* - * Update complete - now make sure everyone knows that the inode - * is dirty. - */ - if (sync_it) - xfs_mark_inode_dirty_sync(ip); -} - -/* * Hook in SELinux. This is not quite correct yet, what we really need * here (as we do for default ACLs) is a mechanism by which creation of * these attrs can be journalled at inode creation time (along with the @@ -224,7 +189,7 @@ xfs_vn_mknod( } xfs_dentry_to_name(&name, dentry); - error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip, NULL); + error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); if (unlikely(error)) goto out_free_acl; @@ -397,7 +362,7 @@ xfs_vn_symlink( (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); xfs_dentry_to_name(&name, dentry); - error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip, NULL); + error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) goto out; diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 2fa0bd9ebc7..214ddd71ff7 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -71,6 +71,7 @@ #include <linux/random.h> #include <linux/ctype.h> #include <linux/writeback.h> +#include <linux/capability.h> #include <asm/page.h> #include <asm/div64.h> @@ -79,14 +80,12 @@ #include <asm/byteorder.h> #include <asm/unaligned.h> -#include <xfs_cred.h> #include <xfs_vnode.h> #include <xfs_stats.h> #include <xfs_sysctl.h> #include <xfs_iops.h> #include <xfs_aops.h> #include <xfs_super.h> -#include <xfs_globals.h> #include <xfs_buf.h> /* @@ -144,7 +143,7 @@ #define SYNCHRONIZE() barrier() #define __return_address __builtin_return_address(0) -#define dfltprid 0 +#define XFS_PROJID_DEFAULT 0 #define MAXPATHLEN 1024 #define MIN(a,b) (min(a,b)) diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index a4e07974955..ab31ce5aeaf 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -44,7 +44,6 @@ #include "xfs_buf_item.h" #include "xfs_utils.h" #include "xfs_vnodeops.h" -#include "xfs_version.h" #include "xfs_log_priv.h" #include "xfs_trans_priv.h" #include "xfs_filestream.h" @@ -645,7 +644,7 @@ xfs_barrier_test( XFS_BUF_ORDERED(sbp); xfsbdstrat(mp, sbp); - error = xfs_iowait(sbp); + error = xfs_buf_iowait(sbp); /* * Clear all the flags we set and possible error state in the @@ -693,8 +692,7 @@ void xfs_blkdev_issue_flush( xfs_buftarg_t *buftarg) { - blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL, - BLKDEV_IFL_WAIT); + blkdev_issue_flush(buftarg->bt_bdev, GFP_KERNEL, NULL); } STATIC void @@ -758,18 +756,20 @@ xfs_open_devices( * Setup xfs_mount buffer target pointers */ error = ENOMEM; - mp->m_ddev_targp = xfs_alloc_buftarg(ddev, 0, mp->m_fsname); + mp->m_ddev_targp = xfs_alloc_buftarg(mp, ddev, 0, mp->m_fsname); if (!mp->m_ddev_targp) goto out_close_rtdev; if (rtdev) { - mp->m_rtdev_targp = xfs_alloc_buftarg(rtdev, 1, mp->m_fsname); + mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev, 1, + mp->m_fsname); if (!mp->m_rtdev_targp) goto out_free_ddev_targ; } if (logdev && logdev != ddev) { - mp->m_logdev_targp = xfs_alloc_buftarg(logdev, 1, mp->m_fsname); + mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev, 1, + mp->m_fsname); if (!mp->m_logdev_targp) goto out_free_rtdev_targ; } else { @@ -972,12 +972,7 @@ xfs_fs_inode_init_once( /* * Dirty the XFS inode when mark_inode_dirty_sync() is called so that - * we catch unlogged VFS level updates to the inode. Care must be taken - * here - the transaction code calls mark_inode_dirty_sync() to mark the - * VFS inode dirty in a transaction and clears the i_update_core field; - * it must clear the field after calling mark_inode_dirty_sync() to - * correctly indicate that the dirty state has been propagated into the - * inode log item. + * we catch unlogged VFS level updates to the inode. * * We need the barrier() to maintain correct ordering between unlogged * updates and the transaction commit code that clears the i_update_core @@ -1521,8 +1516,9 @@ xfs_fs_fill_super( if (error) goto out_free_fsname; - if (xfs_icsb_init_counters(mp)) - mp->m_flags |= XFS_MOUNT_NO_PERCPU_SB; + error = xfs_icsb_init_counters(mp); + if (error) + goto out_close_devices; error = xfs_readsb(mp, flags); if (error) @@ -1583,6 +1579,7 @@ xfs_fs_fill_super( xfs_freesb(mp); out_destroy_counters: xfs_icsb_destroy_counters(mp); + out_close_devices: xfs_close_devices(mp); out_free_fsname: xfs_free_fsname(mp); diff --git a/fs/xfs/linux-2.6/xfs_super.h b/fs/xfs/linux-2.6/xfs_super.h index 1ef4a4d2d99..50a3266c999 100644 --- a/fs/xfs/linux-2.6/xfs_super.h +++ b/fs/xfs/linux-2.6/xfs_super.h @@ -62,6 +62,7 @@ extern void xfs_qm_exit(void); # define XFS_DBG_STRING "no debug" #endif +#define XFS_VERSION_STRING "SGI XFS" #define XFS_BUILD_OPTIONS XFS_ACL_STRING \ XFS_SECURITY_STRING \ XFS_REALTIME_STRING \ diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 81976ffed7d..37d33254981 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -39,42 +39,39 @@ #include <linux/kthread.h> #include <linux/freezer.h> +/* + * The inode lookup is done in batches to keep the amount of lock traffic and + * radix tree lookups to a minimum. The batch size is a trade off between + * lookup reduction and stack usage. This is in the reclaim path, so we can't + * be too greedy. + */ +#define XFS_LOOKUP_BATCH 32 -STATIC xfs_inode_t * -xfs_inode_ag_lookup( - struct xfs_mount *mp, - struct xfs_perag *pag, - uint32_t *first_index, - int tag) +STATIC int +xfs_inode_ag_walk_grab( + struct xfs_inode *ip) { - int nr_found; - struct xfs_inode *ip; + struct inode *inode = VFS_I(ip); - /* - * use a gang lookup to find the next inode in the tree - * as the tree is sparse and a gang lookup walks to find - * the number of objects requested. - */ - if (tag == XFS_ICI_NO_TAG) { - nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, - (void **)&ip, *first_index, 1); - } else { - nr_found = radix_tree_gang_lookup_tag(&pag->pag_ici_root, - (void **)&ip, *first_index, 1, tag); + /* nothing to sync during shutdown */ + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return EFSCORRUPTED; + + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + return ENOENT; + + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + return ENOENT; + + if (is_bad_inode(inode)) { + IRELE(ip); + return ENOENT; } - if (!nr_found) - return NULL; - /* - * Update the index for the next lookup. Catch overflows - * into the next AG range which can occur if we have inodes - * in the last block of the AG and we are currently - * pointing to the last inode. - */ - *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); - if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - return NULL; - return ip; + /* inode is valid */ + return 0; } STATIC int @@ -83,49 +80,75 @@ xfs_inode_ag_walk( struct xfs_perag *pag, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, - int tag, - int exclusive, - int *nr_to_scan) + int flags) { uint32_t first_index; int last_error = 0; int skipped; + int done; + int nr_found; restart: + done = 0; skipped = 0; first_index = 0; + nr_found = 0; do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; int error = 0; - xfs_inode_t *ip; + int i; - if (exclusive) - write_lock(&pag->pag_ici_lock); - else - read_lock(&pag->pag_ici_lock); - ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) { - if (exclusive) - write_unlock(&pag->pag_ici_lock); - else - read_unlock(&pag->pag_ici_lock); + read_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH); + if (!nr_found) { + read_unlock(&pag->pag_ici_lock); break; } - /* execute releases pag->pag_ici_lock */ - error = execute(ip, pag, flags); - if (error == EAGAIN) { - skipped++; - continue; + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_inode_ag_walk_grab(ip)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch overflows + * into the next AG range which can occur if we have inodes + * in the last block of the AG and we are currently + * pointing to the last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + read_unlock(&pag->pag_ici_lock); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = execute(batch[i], pag, flags); + IRELE(batch[i]); + if (error == EAGAIN) { + skipped++; + continue; + } + if (error && last_error != EFSCORRUPTED) + last_error = error; } - if (error) - last_error = error; /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; - } while ((*nr_to_scan)--); + } while (nr_found && !done); if (skipped) { delay(1); @@ -134,110 +157,32 @@ restart: return last_error; } -/* - * Select the next per-ag structure to iterate during the walk. The reclaim - * walk is optimised only to walk AGs with reclaimable inodes in them. - */ -static struct xfs_perag * -xfs_inode_ag_iter_next_pag( - struct xfs_mount *mp, - xfs_agnumber_t *first, - int tag) -{ - struct xfs_perag *pag = NULL; - - if (tag == XFS_ICI_RECLAIM_TAG) { - int found; - int ref; - - spin_lock(&mp->m_perag_lock); - found = radix_tree_gang_lookup_tag(&mp->m_perag_tree, - (void **)&pag, *first, 1, tag); - if (found <= 0) { - spin_unlock(&mp->m_perag_lock); - return NULL; - } - *first = pag->pag_agno + 1; - /* open coded pag reference increment */ - ref = atomic_inc_return(&pag->pag_ref); - spin_unlock(&mp->m_perag_lock); - trace_xfs_perag_get_reclaim(mp, pag->pag_agno, ref, _RET_IP_); - } else { - pag = xfs_perag_get(mp, *first); - (*first)++; - } - return pag; -} - int xfs_inode_ag_iterator( struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, - int tag, - int exclusive, - int *nr_to_scan) + int flags) { struct xfs_perag *pag; int error = 0; int last_error = 0; xfs_agnumber_t ag; - int nr; - nr = nr_to_scan ? *nr_to_scan : INT_MAX; ag = 0; - while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, tag))) { - error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, - exclusive, &nr); + while ((pag = xfs_perag_get(mp, ag))) { + ag = pag->pag_agno + 1; + error = xfs_inode_ag_walk(mp, pag, execute, flags); xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) break; } - if (nr <= 0) - break; } - if (nr_to_scan) - *nr_to_scan = nr; return XFS_ERROR(last_error); } -/* must be called with pag_ici_lock held and releases it */ -int -xfs_sync_inode_valid( - struct xfs_inode *ip, - struct xfs_perag *pag) -{ - struct inode *inode = VFS_I(ip); - int error = EFSCORRUPTED; - - /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - goto out_unlock; - - /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ - error = ENOENT; - if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) - goto out_unlock; - - /* If we can't grab the inode, it must on it's way to reclaim. */ - if (!igrab(inode)) - goto out_unlock; - - if (is_bad_inode(inode)) { - IRELE(ip); - goto out_unlock; - } - - /* inode is valid */ - error = 0; -out_unlock: - read_unlock(&pag->pag_ici_lock); - return error; -} - STATIC int xfs_sync_inode_data( struct xfs_inode *ip, @@ -248,10 +193,6 @@ xfs_sync_inode_data( struct address_space *mapping = inode->i_mapping; int error = 0; - error = xfs_sync_inode_valid(ip, pag); - if (error) - return error; - if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) goto out_wait; @@ -268,7 +209,6 @@ xfs_sync_inode_data( out_wait: if (flags & SYNC_WAIT) xfs_ioend_wait(ip); - IRELE(ip); return error; } @@ -280,10 +220,6 @@ xfs_sync_inode_attr( { int error = 0; - error = xfs_sync_inode_valid(ip, pag); - if (error) - return error; - xfs_ilock(ip, XFS_ILOCK_SHARED); if (xfs_inode_clean(ip)) goto out_unlock; @@ -302,7 +238,6 @@ xfs_sync_inode_attr( out_unlock: xfs_iunlock(ip, XFS_ILOCK_SHARED); - IRELE(ip); return error; } @@ -318,8 +253,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); - error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0, NULL); + error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags); if (error) return XFS_ERROR(error); @@ -337,8 +271,7 @@ xfs_sync_attr( { ASSERT((flags & ~SYNC_WAIT) == 0); - return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0, NULL); + return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags); } STATIC int @@ -698,6 +631,43 @@ __xfs_inode_clear_reclaim_tag( } /* + * Grab the inode for reclaim exclusively. + * Return 0 if we grabbed it, non-zero otherwise. + */ +STATIC int +xfs_reclaim_inode_grab( + struct xfs_inode *ip, + int flags) +{ + + /* + * do some unlocked checks first to avoid unnecceary lock traffic. + * The first is a flush lock check, the second is a already in reclaim + * check. Only do these checks if we are not going to block on locks. + */ + if ((flags & SYNC_TRYLOCK) && + (!ip->i_flush.done || __xfs_iflags_test(ip, XFS_IRECLAIM))) { + return 1; + } + + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + return 1; + } + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + return 0; +} + +/* * Inodes in different states need to be treated differently, and the return * value of xfs_iflush is not sufficient to get this right. The following table * lists the inode states and the reclaim actions necessary for non-blocking @@ -755,23 +725,6 @@ xfs_reclaim_inode( { int error = 0; - /* - * The radix tree lock here protects a thread in xfs_iget from racing - * with us starting reclaim on the inode. Once we have the - * XFS_IRECLAIM flag set it will not touch us. - */ - spin_lock(&ip->i_flags_lock); - ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); - if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { - /* ignore as it is already under reclaim */ - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - return 0; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_ilock(ip, XFS_ILOCK_EXCL); if (!xfs_iflock_nowait(ip)) { if (!(sync_mode & SYNC_WAIT)) @@ -868,13 +821,126 @@ reclaim: } +/* + * Walk the AGs and reclaim the inodes in them. Even if the filesystem is + * corrupted, we still want to try to reclaim all the inodes. If we don't, + * then a shut down during filesystem unmount reclaim walk leak all the + * unreclaimed inodes. + */ +int +xfs_reclaim_inodes_ag( + struct xfs_mount *mp, + int flags, + int *nr_to_scan) +{ + struct xfs_perag *pag; + int error = 0; + int last_error = 0; + xfs_agnumber_t ag; + int trylock = flags & SYNC_TRYLOCK; + int skipped; + +restart: + ag = 0; + skipped = 0; + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + unsigned long first_index = 0; + int done = 0; + int nr_found = 0; + + ag = pag->pag_agno + 1; + + if (trylock) { + if (!mutex_trylock(&pag->pag_ici_reclaim_lock)) { + skipped++; + continue; + } + first_index = pag->pag_ici_reclaim_cursor; + } else + mutex_lock(&pag->pag_ici_reclaim_lock); + + do { + struct xfs_inode *batch[XFS_LOOKUP_BATCH]; + int i; + + write_lock(&pag->pag_ici_lock); + nr_found = radix_tree_gang_lookup_tag( + &pag->pag_ici_root, + (void **)batch, first_index, + XFS_LOOKUP_BATCH, + XFS_ICI_RECLAIM_TAG); + if (!nr_found) { + write_unlock(&pag->pag_ici_lock); + break; + } + + /* + * Grab the inodes before we drop the lock. if we found + * nothing, nr == 0 and the loop will be skipped. + */ + for (i = 0; i < nr_found; i++) { + struct xfs_inode *ip = batch[i]; + + if (done || xfs_reclaim_inode_grab(ip, flags)) + batch[i] = NULL; + + /* + * Update the index for the next lookup. Catch + * overflows into the next AG range which can + * occur if we have inodes in the last block of + * the AG and we are currently pointing to the + * last inode. + */ + first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); + if (first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) + done = 1; + } + + /* unlock now we've grabbed the inodes. */ + write_unlock(&pag->pag_ici_lock); + + for (i = 0; i < nr_found; i++) { + if (!batch[i]) + continue; + error = xfs_reclaim_inode(batch[i], pag, flags); + if (error && last_error != EFSCORRUPTED) + last_error = error; + } + + *nr_to_scan -= XFS_LOOKUP_BATCH; + + } while (nr_found && !done && *nr_to_scan > 0); + + if (trylock && !done) + pag->pag_ici_reclaim_cursor = first_index; + else + pag->pag_ici_reclaim_cursor = 0; + mutex_unlock(&pag->pag_ici_reclaim_lock); + xfs_perag_put(pag); + } + + /* + * if we skipped any AG, and we still have scan count remaining, do + * another pass this time using blocking reclaim semantics (i.e + * waiting on the reclaim locks and ignoring the reclaim cursors). This + * ensure that when we get more reclaimers than AGs we block rather + * than spin trying to execute reclaim. + */ + if (trylock && skipped && *nr_to_scan > 0) { + trylock = 0; + goto restart; + } + return XFS_ERROR(last_error); +} + int xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1, NULL); + int nr_to_scan = INT_MAX; + + return xfs_reclaim_inodes_ag(mp, mode, &nr_to_scan); } /* @@ -896,17 +962,16 @@ xfs_reclaim_inode_shrink( if (!(gfp_mask & __GFP_FS)) return -1; - xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, - XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); - /* if we don't exhaust the scan, don't bother coming back */ + xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK, &nr_to_scan); + /* terminate if we don't exhaust the scan */ if (nr_to_scan > 0) return -1; } reclaimable = 0; ag = 0; - while ((pag = xfs_inode_ag_iter_next_pag(mp, &ag, - XFS_ICI_RECLAIM_TAG))) { + while ((pag = xfs_perag_get_tag(mp, ag, XFS_ICI_RECLAIM_TAG))) { + ag = pag->pag_agno + 1; reclaimable += pag->pag_ici_reclaimable; xfs_perag_put(pag); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index fe78726196f..32ba6628290 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -47,10 +47,10 @@ void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); -int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); +int xfs_sync_inode_grab(struct xfs_inode *ip); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock, int *nr_to_scan); + int flags); void xfs_inode_shrinker_register(struct xfs_mount *mp); void xfs_inode_shrinker_unregister(struct xfs_mount *mp); diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index be5dffd282a..acef2e98c59 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -124,7 +124,7 @@ DEFINE_EVENT(xfs_perag_class, name, \ unsigned long caller_ip), \ TP_ARGS(mp, agno, refcount, caller_ip)) DEFINE_PERAG_REF_EVENT(xfs_perag_get); -DEFINE_PERAG_REF_EVENT(xfs_perag_get_reclaim); +DEFINE_PERAG_REF_EVENT(xfs_perag_get_tag); DEFINE_PERAG_REF_EVENT(xfs_perag_put); DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim); DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim); @@ -325,13 +325,12 @@ DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); DEFINE_BUF_EVENT(xfs_buf_cond_lock); DEFINE_BUF_EVENT(xfs_buf_unlock); -DEFINE_BUF_EVENT(xfs_buf_ordered_retry); DEFINE_BUF_EVENT(xfs_buf_iowait); DEFINE_BUF_EVENT(xfs_buf_iowait_done); DEFINE_BUF_EVENT(xfs_buf_delwri_queue); DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); DEFINE_BUF_EVENT(xfs_buf_delwri_split); -DEFINE_BUF_EVENT(xfs_buf_get_noaddr); +DEFINE_BUF_EVENT(xfs_buf_get_uncached); DEFINE_BUF_EVENT(xfs_bdstrat_shut); DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); diff --git a/fs/xfs/linux-2.6/xfs_version.h b/fs/xfs/linux-2.6/xfs_version.h deleted file mode 100644 index f8d279d7563..00000000000 --- a/fs/xfs/linux-2.6/xfs_version.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright (c) 2001-2002,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_VERSION_H__ -#define __XFS_VERSION_H__ - -/* - * Dummy file that can contain a timestamp to put into the - * XFS init string, to help users keep track of what they're - * running - */ - -#define XFS_VERSION_STRING "SGI XFS" - -#endif /* __XFS_VERSION_H__ */ |