Diffstat (limited to 'fs')
-rw-r--r-- | fs/buffer.c                 |  33
-rw-r--r-- | fs/cifs/file.c              |   2
-rw-r--r-- | fs/compat.c                 |  10
-rw-r--r-- | fs/direct-io.c              | 323
-rw-r--r-- | fs/exec.c                   |   2
-rw-r--r-- | fs/fcntl.c                  |   5
-rw-r--r-- | fs/file.c                   | 255
-rw-r--r-- | fs/jbd/transaction.c        |   2
-rw-r--r-- | fs/jfs/jfs_filsys.h         |  42
-rw-r--r-- | fs/open.c                   |   3
-rw-r--r-- | fs/proc/base.c              |  24
-rw-r--r-- | fs/proc/proc_misc.c         |  12
-rw-r--r-- | fs/select.c                 |  10
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c |   2
14 files changed, 321 insertions, 404 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 517860f2d75..d1f1b54d310 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -35,6 +35,7 @@
 #include <linux/hash.h>
 #include <linux/suspend.h>
 #include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
 #include <linux/notifier.h>
 #include <linux/cpu.h>
@@ -724,20 +725,21 @@ int __set_page_dirty_buffers(struct page *page)
 	}
 	spin_unlock(&mapping->private_lock);
 
-	if (!TestSetPageDirty(page)) {
-		write_lock_irq(&mapping->tree_lock);
-		if (page->mapping) {	/* Race with truncate? */
-			if (mapping_cap_account_dirty(mapping))
-				__inc_zone_page_state(page, NR_FILE_DIRTY);
-			radix_tree_tag_set(&mapping->page_tree,
-						page_index(page),
-						PAGECACHE_TAG_DIRTY);
+	if (TestSetPageDirty(page))
+		return 0;
+
+	write_lock_irq(&mapping->tree_lock);
+	if (page->mapping) {	/* Race with truncate? */
+		if (mapping_cap_account_dirty(mapping)) {
+			__inc_zone_page_state(page, NR_FILE_DIRTY);
+			task_io_account_write(PAGE_CACHE_SIZE);
 		}
-		write_unlock_irq(&mapping->tree_lock);
-		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
-		return 1;
+		radix_tree_tag_set(&mapping->page_tree,
+				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	return 0;
+	write_unlock_irq(&mapping->tree_lock);
+	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	return 1;
 }
 EXPORT_SYMBOL(__set_page_dirty_buffers);
 
@@ -2851,8 +2853,13 @@ int try_to_free_buffers(struct page *page)
 	 * could encounter a non-uptodate page, which is unresolvable.
 	 * This only applies in the rare case where try_to_free_buffers
 	 * succeeds but the page is not freed.
+	 *
+	 * Also, during truncate, discard_buffer will have marked all
+	 * the page's buffers clean.  We discover that here and clean
+	 * the page also.
 	 */
-	clear_page_dirty(page);
+	if (test_clear_page_dirty(page))
+		task_io_account_cancelled_write(PAGE_CACHE_SIZE);
 }
 out:
 	if (buffers_to_free) {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1aa95a50cac..0f05cab5d24 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -29,6 +29,7 @@
 #include <linux/pagevec.h>
 #include <linux/smp_lock.h>
 #include <linux/writeback.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/delay.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
@@ -1812,6 +1813,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 			cFYI(1, ("Read error in readpages: %d", rc));
 			break;
 		} else if (bytes_read > 0) {
+			task_io_account_read(bytes_read);
 			pSMBr = (struct smb_com_read_rsp *)smb_read_data;
 			cifs_copy_cache_pages(mapping, page_list, bytes_read,
 				smb_read_data + 4 /* RFC1001 hdr */ +
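The buffer.c and cifs hunks above call into the new per-task I/O accounting layer. The sketch below shows roughly what the helpers amount to; it is an assumption based on this patch series, not the verbatim header (see the real <linux/task_io_accounting_ops.h> for the authoritative definitions), and the ->ioac field exists only under CONFIG_TASK_IO_ACCOUNTING:

/* Sketch only: per-task byte counters bumped on "current". */
static inline void task_io_account_read(size_t bytes)
{
	current->ioac.read_bytes += bytes;
}

static inline void task_io_account_write(size_t bytes)
{
	current->ioac.write_bytes += bytes;
}

/*
 * A page that was dirtied and then truncated away never reaches disk;
 * crediting it back keeps write_bytes honest, which is why
 * try_to_free_buffers() above calls this when it cleans the page.
 */
static inline void task_io_account_cancelled_write(size_t bytes)
{
	current->ioac.cancelled_write_bytes += bytes;
}
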
diff --git a/fs/compat.c b/fs/compat.c
index b766964a625..0ec70e3cee0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1679,19 +1679,19 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
 {
 	fd_set_bits fds;
 	char *bits;
-	int size, max_fdset, ret = -EINVAL;
+	int size, max_fds, ret = -EINVAL;
 	struct fdtable *fdt;
 
 	if (n < 0)
 		goto out_nofds;
 
-	/* max_fdset can increase, so grab it once to avoid race */
+	/* max_fds can increase, so grab it once to avoid race */
 	rcu_read_lock();
 	fdt = files_fdtable(current->files);
-	max_fdset = fdt->max_fdset;
+	max_fds = fdt->max_fds;
 	rcu_read_unlock();
-	if (n > max_fdset)
-		n = max_fdset;
+	if (n > max_fds)
+		n = max_fds;
 
 	/*
 	 * We need 6 bitmaps (in/out/ex for both incoming and outgoing),
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5981e17f46f..d9d0833444f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
@@ -121,8 +122,7 @@ struct dio {
 	/* BIO completion state */
 	spinlock_t bio_lock;		/* protects BIO fields below */
-	int bio_count;			/* nr bios to be completed */
-	int bios_in_flight;		/* nr bios in flight */
+	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
@@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio)
 	return dio->pages[dio->head++];
 }
 
-/*
- * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_block.  Pass the
- * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_block calls and dio_complete.
- */
-static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
-{
-	if (dio->end_io && dio->result)
-		dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-}
-
-/*
- * Called when a BIO has been processed.  If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This releases locks as dictated by the locking type, lets interested parties
+ * know that a DIO operation has completed, and calculates the resulting return
+ * code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block.  Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
  */
-static void finished_one_bio(struct dio *dio)
+static int dio_complete(struct dio *dio, loff_t offset, int ret)
 {
-	unsigned long flags;
+	ssize_t transferred = 0;
 
-	spin_lock_irqsave(&dio->bio_lock, flags);
-	if (dio->bio_count == 1) {
-		if (dio->is_async) {
-			ssize_t transferred;
-			loff_t offset;
-
-			/*
-			 * Last reference to the dio is going away.
-			 * Drop spinlock and complete the DIO.
-			 */
-			spin_unlock_irqrestore(&dio->bio_lock, flags);
+	/*
+	 * AIO submission can race with bio completion to get here while
+	 * expecting to have the last io completed by bio completion.
+	 * In that case -EIOCBQUEUED is in fact not an error we want
+	 * to preserve through this call.
+	 */
+	if (ret == -EIOCBQUEUED)
+		ret = 0;
 
-			/* Check for short read case */
-			transferred = dio->result;
-			offset = dio->iocb->ki_pos;
+	if (dio->result) {
+		transferred = dio->result;
 
-			if ((dio->rw == READ) &&
-			    ((offset + transferred) > dio->i_size))
-				transferred = dio->i_size - offset;
+		/* Check for short read case */
+		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+			transferred = dio->i_size - offset;
+	}
 
-			/* check for error in completion path */
-			if (dio->io_error)
-				transferred = dio->io_error;
+	if (dio->end_io && dio->result)
+		dio->end_io(dio->iocb, offset, transferred,
+			    dio->map_bh.b_private);
+	if (dio->lock_type == DIO_LOCKING)
+		/* lockdep: non-owner release */
+		up_read_non_owner(&dio->inode->i_alloc_sem);
 
-			dio_complete(dio, offset, transferred);
+	if (ret == 0)
+		ret = dio->page_errors;
+	if (ret == 0)
+		ret = dio->io_error;
+	if (ret == 0)
+		ret = transferred;
 
-			/* Complete AIO later if falling back to buffered i/o */
-			if (dio->result == dio->size ||
-				((dio->rw == READ) && dio->result)) {
-				aio_complete(dio->iocb, transferred, 0);
-				kfree(dio);
-				return;
-			} else {
-				/*
-				 * Falling back to buffered
-				 */
-				spin_lock_irqsave(&dio->bio_lock, flags);
-				dio->bio_count--;
-				if (dio->waiter)
-					wake_up_process(dio->waiter);
-				spin_unlock_irqrestore(&dio->bio_lock, flags);
-				return;
-			}
-		}
-	}
-	dio->bio_count--;
-	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	return ret;
 }
 
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
 	struct dio *dio = bio->bi_private;
+	unsigned long remaining;
+	unsigned long flags;
 
 	if (bio->bi_size)
 		return 1;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	remaining = --dio->refcount;
+	if (remaining == 1 && dio->waiter)
+		wake_up_process(dio->waiter);
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+	if (remaining == 0) {
+		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+		aio_complete(dio->iocb, ret, 0);
+		kfree(dio);
+	}
+
 	return 0;
 }
 
@@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	bio->bi_private = dio->bio_list;
 	dio->bio_list = bio;
-	dio->bios_in_flight--;
-	if (dio->waiter && dio->bios_in_flight == 0)
+	if (--dio->refcount == 1 && dio->waiter)
 		wake_up_process(dio->waiter);
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 	return 0;
@@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  * In the AIO read case we speculatively dirty the pages before starting IO.
  * During IO completion, any of these pages which happen to have been written
  * back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
  */
 static void dio_bio_submit(struct dio *dio)
 {
@@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio)
 	unsigned long flags;
 
 	bio->bi_private = dio;
+
 	spin_lock_irqsave(&dio->bio_lock, flags);
-	dio->bio_count++;
-	dio->bios_in_flight++;
+	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
+
 	submit_bio(dio->rw, bio);
 
 	dio->bio = NULL;
@@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio)
 }
 
 /*
- * Wait for the next BIO to complete.  Remove it and return it.
NULL is + * returned once all BIOs have been completed. This must only be called once + * all bios have been issued so that dio->refcount can only decrease. This + * requires that that the caller hold a reference on the dio. */ static struct bio *dio_await_one(struct dio *dio) { unsigned long flags; - struct bio *bio; + struct bio *bio = NULL; spin_lock_irqsave(&dio->bio_lock, flags); - while (dio->bio_list == NULL) { - set_current_state(TASK_UNINTERRUPTIBLE); - if (dio->bio_list == NULL) { - dio->waiter = current; - spin_unlock_irqrestore(&dio->bio_lock, flags); - blk_run_address_space(dio->inode->i_mapping); - io_schedule(); - spin_lock_irqsave(&dio->bio_lock, flags); - dio->waiter = NULL; - } - set_current_state(TASK_RUNNING); + + /* + * Wait as long as the list is empty and there are bios in flight. bio + * completion drops the count, maybe adds to the list, and wakes while + * holding the bio_lock so we don't need set_current_state()'s barrier + * and can call it after testing our condition. + */ + while (dio->refcount > 1 && dio->bio_list == NULL) { + __set_current_state(TASK_UNINTERRUPTIBLE); + dio->waiter = current; + spin_unlock_irqrestore(&dio->bio_lock, flags); + io_schedule(); + /* wake up sets us TASK_RUNNING */ + spin_lock_irqsave(&dio->bio_lock, flags); + dio->waiter = NULL; + } + if (dio->bio_list) { + bio = dio->bio_list; + dio->bio_list = bio->bi_private; } - bio = dio->bio_list; - dio->bio_list = bio->bi_private; spin_unlock_irqrestore(&dio->bio_lock, flags); return bio; } @@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio) } bio_put(bio); } - finished_one_bio(dio); return uptodate ? 0 : -EIO; } /* - * Wait on and process all in-flight BIOs. + * Wait on and process all in-flight BIOs. This must only be called once + * all bios have been issued so that the refcount can only decrease. + * This just waits for all bios to make it through dio_bio_complete. IO + * errors are propogated through dio->io_error and should be propogated via + * dio_complete(). */ -static int dio_await_completion(struct dio *dio) +static void dio_await_completion(struct dio *dio) { - int ret = 0; - - if (dio->bio) - dio_bio_submit(dio); - - /* - * The bio_lock is not held for the read of bio_count. - * This is ok since it is the dio_bio_complete() that changes - * bio_count. - */ - while (dio->bio_count) { - struct bio *bio = dio_await_one(dio); - int ret2; - - ret2 = dio_bio_complete(dio, bio); - if (ret == 0) - ret = ret2; - } - return ret; + struct bio *bio; + do { + bio = dio_await_one(dio); + if (bio) + dio_bio_complete(dio, bio); + } while (bio); } /* @@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page, { int ret = 0; + if (dio->rw & WRITE) { + /* + * Read accounting is performed in submit_bio() + */ + task_io_account_write(len); + } + /* * Can we just grow the current page's presence in the dio? */ @@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, struct dio *dio) { unsigned long user_addr; + unsigned long flags; int seg; ssize_t ret = 0; ssize_t ret2; @@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, dio->iocb = iocb; dio->i_size = i_size_read(inode); - /* - * BIO completion state. - * - * ->bio_count starts out at one, and we decrement it to zero after all - * BIOs are submitted. This to avoid the situation where a really fast - * (or synchronous) device could take the count to zero while we're - * still submitting BIOs. 
-	 */
-	dio->bio_count = 1;
-	dio->bios_in_flight = 0;
 	spin_lock_init(&dio->bio_lock);
+	dio->refcount = 1;
 	dio->bio_list = NULL;
 	dio->waiter = NULL;
 
@@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->bio)
 		dio_bio_submit(dio);
 
+	/* All IO is now issued, send it on its way */
+	blk_run_address_space(inode->i_mapping);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
@@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
-	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
-	 * reflect the number of to-be-processed BIOs.
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write have been setup.  In that case
+	 * bio completion will call aio_complete.  The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
 	 */
-	if (dio->is_async) {
-		int should_wait = 0;
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (dio->is_async && ret == 0 && dio->result &&
+	    ((rw & READ) || (dio->result == dio->size)))
+		ret = -EIOCBQUEUED;
 
-		if (dio->result < dio->size && (rw & WRITE)) {
-			dio->waiter = current;
-			should_wait = 1;
-		}
-		if (ret == 0)
-			ret = dio->result;
-		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_address_space(inode->i_mapping);
-		if (should_wait) {
-			unsigned long flags;
-			/*
-			 * Wait for already issued I/O to drain out and
-			 * release its references to user-space pages
-			 * before returning to fallback on buffered I/O
-			 */
-
-			spin_lock_irqsave(&dio->bio_lock, flags);
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			while (dio->bio_count) {
-				spin_unlock_irqrestore(&dio->bio_lock, flags);
-				io_schedule();
-				spin_lock_irqsave(&dio->bio_lock, flags);
-				set_current_state(TASK_UNINTERRUPTIBLE);
-			}
-			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			set_current_state(TASK_RUNNING);
-			kfree(dio);
-		}
-	} else {
-		ssize_t transferred = 0;
-
-		finished_one_bio(dio);
-		ret2 = dio_await_completion(dio);
-		if (ret == 0)
-			ret = ret2;
-		if (ret == 0)
-			ret = dio->page_errors;
-		if (dio->result) {
-			loff_t i_size = i_size_read(inode);
-
-			transferred = dio->result;
-			/*
-			 * Adjust the return value if the read crossed a
-			 * non-block-aligned EOF.
-			 */
-			if (rw == READ && (offset + transferred > i_size))
-				transferred = i_size - offset;
-		}
-		dio_complete(dio, offset, transferred);
-		if (ret == 0)
-			ret = transferred;
+	if (ret != -EIOCBQUEUED)
+		dio_await_completion(dio);
 
-		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-		    ret >= 0 && dio->result == dio->size)
-			/*
-			 * For AIO writes where we have completed the
-			 * i/o, we have to mark the the aio complete.
-			 */
-			aio_complete(iocb, ret, 0);
+	/*
+	 * Sync will always be dropping the final ref and completing the
+	 * operation.  AIO can if it was a broken operation described above or
+	 * in fact if all the bios race to complete before we get here.  In
+	 * that case dio_complete() translates the EIOCBQUEUED into the proper
+	 * return code that the caller will hand to aio_complete().
+	 *
+	 * This is managed by the bio_lock instead of being an atomic_t so that
+	 * completion paths can drop their ref and use the remaining count to
+	 * decide to wake the submission path atomically.
+	 */
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	ret2 = --dio->refcount;
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	BUG_ON(!dio->is_async && ret2 != 0);
+	if (ret2 == 0) {
+		ret = dio_complete(dio, offset, ret);
 		kfree(dio);
-	}
+	} else
+		BUG_ON(ret != -EIOCBQUEUED);
+
 	return ret;
 }
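The direct-io.c rework above collapses bio_count/bios_in_flight into one refcount with a simple lifecycle: direct_io_worker() holds a reference for the whole operation, every bio holds one between submit_bio() and its ->end_io handler, a synchronous waiter sleeps while refcount > 1, and whichever path drops the count to zero runs dio_complete(). A userspace pthreads analogue of that pattern (all names here are hypothetical, for illustration only):

#include <pthread.h>

struct dio_like {
	pthread_mutex_t	lock;
	pthread_cond_t	wake;
	unsigned long	refcount;	/* submitter + units in flight */
};

/* Starts at 1: the submitter's own reference. */
#define DIO_LIKE_INIT	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 1 }

static void submit_one(struct dio_like *d)
{
	pthread_mutex_lock(&d->lock);
	d->refcount++;			/* each in-flight unit holds a ref */
	pthread_mutex_unlock(&d->lock);
	/* ...hand the unit to a completion thread here... */
}

static void complete_one(struct dio_like *d)	/* completion context */
{
	unsigned long remaining;

	pthread_mutex_lock(&d->lock);
	remaining = --d->refcount;
	if (remaining == 1)		/* only the submitter's ref is left */
		pthread_cond_signal(&d->wake);
	pthread_mutex_unlock(&d->lock);

	if (remaining == 0) {
		/* async case: the last ref was dropped here, so finish
		 * the whole operation (the kernel calls aio_complete()) */
	}
}

static void await_all(struct dio_like *d)	/* sync case */
{
	pthread_mutex_lock(&d->lock);
	while (d->refcount > 1)
		pthread_cond_wait(&d->wake, &d->lock);
	d->refcount--;			/* drop the submitter's reference */
	pthread_mutex_unlock(&d->lock);
}

Keeping the count under a lock rather than in an atomic mirrors the patch's own rationale: a completion path can drop its reference and inspect the remainder in one critical section, deciding atomically whether to wake the submitter.
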
diff --git a/fs/exec.c b/fs/exec.c
index 12d8cd461b4..11fe93f7363 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -783,7 +783,7 @@ static void flush_old_files(struct files_struct * files)
 		j++;
 		i = j * __NFDBITS;
 		fdt = files_fdtable(files);
-		if (i >= fdt->max_fds || i >= fdt->max_fdset)
+		if (i >= fdt->max_fds)
 			break;
 		set = fdt->close_on_exec->fds_bits[j];
 		if (!set)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2bdaef35da5..8e382a5d51b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -77,10 +77,9 @@ repeat:
 	start = files->next_fd;
 
 	newfd = start;
-	if (start < fdt->max_fdset) {
+	if (start < fdt->max_fds)
 		newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
-			fdt->max_fdset, start);
-	}
+					   fdt->max_fds, start);
 
 	error = -EMFILE;
 	if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
diff --git a/fs/file.c b/fs/file.c
index 51aef675470..857fa49e984 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -32,46 +32,28 @@ struct fdtable_defer {
  */
 static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 
-
-/*
- * Allocate an fd array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
- */
-struct file ** alloc_fd_array(int num)
+static inline void * alloc_fdmem(unsigned int size)
 {
-	struct file **new_fds;
-	int size = num * sizeof(struct file *);
-
 	if (size <= PAGE_SIZE)
-		new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
+		return kmalloc(size, GFP_KERNEL);
 	else
-		new_fds = (struct file **) vmalloc(size);
-	return new_fds;
+		return vmalloc(size);
 }
 
-void free_fd_array(struct file **array, int num)
+static inline void free_fdarr(struct fdtable *fdt)
 {
-	int size = num * sizeof(struct file *);
-
-	if (!array) {
-		printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
-		return;
-	}
-
-	if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
-		return;
-	else if (size <= PAGE_SIZE)
-		kfree(array);
+	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
+		kfree(fdt->fd);
 	else
-		vfree(array);
+		vfree(fdt->fd);
 }
 
-static void __free_fdtable(struct fdtable *fdt)
+static inline void free_fdset(struct fdtable *fdt)
 {
-	free_fdset(fdt->open_fds, fdt->max_fdset);
-	free_fdset(fdt->close_on_exec, fdt->max_fdset);
-	free_fd_array(fdt->fd, fdt->max_fds);
-	kfree(fdt);
+	if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
+		kfree(fdt->open_fds);
+	else
+		vfree(fdt->open_fds);
 }
 
 static void free_fdtable_work(struct work_struct *work)
@@ -86,41 +68,32 @@ static void free_fdtable_work(struct work_struct *work)
 	spin_unlock_bh(&f->lock);
 	while(fdt) {
 		struct fdtable *next = fdt->next;
-		__free_fdtable(fdt);
+
+		vfree(fdt->fd);
+		free_fdset(fdt);
+		kfree(fdt);
 		fdt = next;
 	}
 }
 
-static void free_fdtable_rcu(struct rcu_head *rcu)
+void free_fdtable_rcu(struct rcu_head *rcu)
 {
 	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
-	int fdset_size, fdarray_size;
 	struct fdtable_defer *fddef;
 
 	BUG_ON(!fdt);
-	fdset_size = fdt->max_fdset / 8;
-	fdarray_size = fdt->max_fds * sizeof(struct file *);
 
-	if (fdt->free_files) {
+	if (fdt->max_fds <= NR_OPEN_DEFAULT) {
 		/*
-		 * The this fdtable was embedded in the files structure
-		 * and the files structure itself was getting destroyed.
-		 * It is now safe to free the files structure.
+		 * This fdtable is embedded in the files structure and that
+		 * structure itself is getting destroyed.
 		 */
-		kmem_cache_free(files_cachep, fdt->free_files);
+		kmem_cache_free(files_cachep,
+				container_of(fdt, struct files_struct, fdtab));
 		return;
 	}
-	if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
-	    fdt->max_fds <= NR_OPEN_DEFAULT) {
-		/*
-		 * The fdtable was embedded
-		 */
-		return;
-	}
-	if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
-		kfree(fdt->open_fds);
-		kfree(fdt->close_on_exec);
+	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
 		kfree(fdt->fd);
+		kfree(fdt->open_fds);
 		kfree(fdt);
 	} else {
 		fddef = &get_cpu_var(fdtable_defer_list);
@@ -134,136 +107,74 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
 	}
 }
 
-void free_fdtable(struct fdtable *fdt)
-{
-	if (fdt->free_files ||
-	    fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
-	    fdt->max_fds > NR_OPEN_DEFAULT)
-		call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
 /*
  * Expand the fdset in the files_struct.  Called with the files spinlock
  * held for write.
  */
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
 {
-	int i;
-	int count;
-
-	BUG_ON(nfdt->max_fdset < fdt->max_fdset);
-	BUG_ON(nfdt->max_fds < fdt->max_fds);
-	/* Copy the existing tables and install the new pointers */
-
-	i = fdt->max_fdset / (sizeof(unsigned long) * 8);
-	count = (nfdt->max_fdset - fdt->max_fdset) / 8;
+	unsigned int cpy, set;
 
-	/*
-	 * Don't copy the entire array if the current fdset is
-	 * not yet initialised.
-	 */
-	if (i) {
-		memcpy (nfdt->open_fds, fdt->open_fds,
-						fdt->max_fdset/8);
-		memcpy (nfdt->close_on_exec, fdt->close_on_exec,
-						fdt->max_fdset/8);
-		memset (&nfdt->open_fds->fds_bits[i], 0, count);
-		memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
-	}
-
-	/* Don't copy/clear the array if we are creating a new
-	   fd array for fork() */
-	if (fdt->max_fds) {
-		memcpy(nfdt->fd, fdt->fd,
-			fdt->max_fds * sizeof(struct file *));
-		/* clear the remainder of the array */
-		memset(&nfdt->fd[fdt->max_fds], 0,
-		       (nfdt->max_fds - fdt->max_fds) *
-					sizeof(struct file *));
-	}
-}
-
-/*
- * Allocate an fdset array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
- */
-fd_set * alloc_fdset(int num)
-{
-	fd_set *new_fdset;
-	int size = num / 8;
+	BUG_ON(nfdt->max_fds < ofdt->max_fds);
+	if (ofdt->max_fds == 0)
+		return;
 
-	if (size <= PAGE_SIZE)
-		new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
-	else
-		new_fdset = (fd_set *) vmalloc(size);
-	return new_fdset;
+	cpy = ofdt->max_fds * sizeof(struct file *);
+	set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
+	memcpy(nfdt->fd, ofdt->fd, cpy);
+	memset((char *)(nfdt->fd) + cpy, 0, set);
+
+	cpy = ofdt->max_fds / BITS_PER_BYTE;
+	set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
+	memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+	memset((char *)(nfdt->open_fds) + cpy, 0, set);
+	memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+	memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
 }
 
-void free_fdset(fd_set *array, int num)
+static struct fdtable * alloc_fdtable(unsigned int nr)
 {
-	if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
-		return;
-	else if (num <= 8 * PAGE_SIZE)
-		kfree(array);
-	else
-		vfree(array);
-}
+	struct fdtable *fdt;
+	char *data;
 
-static struct fdtable *alloc_fdtable(int nr)
-{
-	struct fdtable *fdt = NULL;
-	int nfds = 0;
-	fd_set *new_openset = NULL, *new_execset = NULL;
-	struct file **new_fds;
+	/*
+	 * Figure out how many fds we actually want to support in this fdtable.
+	 * Allocation steps are keyed to the size of the fdarray, since it
+	 * grows far faster than any of the other dynamic data.  We try to fit
+	 * the fdarray into comfortable page-tuned chunks: starting at 1024B
+	 * and growing in powers of two from there on.
	 */
+	nr /= (1024 / sizeof(struct file *));
+	nr = roundup_pow_of_two(nr + 1);
+	nr *= (1024 / sizeof(struct file *));
+	if (nr > NR_OPEN)
+		nr = NR_OPEN;
 
-	fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
 	if (!fdt)
-		goto out;
-
-	nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1));
-	if (nfds > NR_OPEN)
-		nfds = NR_OPEN;
+		goto out;
+	fdt->max_fds = nr;
+	data = alloc_fdmem(nr * sizeof(struct file *));
+	if (!data)
+		goto out_fdt;
+	fdt->fd = (struct file **)data;
+	data = alloc_fdmem(max_t(unsigned int,
+				 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+	if (!data)
+		goto out_arr;
+	fdt->open_fds = (fd_set *)data;
+	data += nr / BITS_PER_BYTE;
+	fdt->close_on_exec = (fd_set *)data;
+	INIT_RCU_HEAD(&fdt->rcu);
+	fdt->next = NULL;
 
-	new_openset = alloc_fdset(nfds);
-	new_execset = alloc_fdset(nfds);
-	if (!new_openset || !new_execset)
-		goto out;
-	fdt->open_fds = new_openset;
-	fdt->close_on_exec = new_execset;
-	fdt->max_fdset = nfds;
-
-	nfds = NR_OPEN_DEFAULT;
-	/*
-	 * Expand to the max in easy steps, and keep expanding it until
-	 * we have enough for the requested fd array size.
-	 */
-	do {
-#if NR_OPEN_DEFAULT < 256
-		if (nfds < 256)
-			nfds = 256;
-		else
-#endif
-		if (nfds < (PAGE_SIZE / sizeof(struct file *)))
-			nfds = PAGE_SIZE / sizeof(struct file *);
-		else {
-			nfds = nfds * 2;
-			if (nfds > NR_OPEN)
-				nfds = NR_OPEN;
-		}
-	} while (nfds <= nr);
-	new_fds = alloc_fd_array(nfds);
-	if (!new_fds)
-		goto out2;
-	fdt->fd = new_fds;
-	fdt->max_fds = nfds;
-	fdt->free_files = NULL;
 	return fdt;
-out2:
-	nfds = fdt->max_fdset;
-out:
-	free_fdset(new_openset, nfds);
-	free_fdset(new_execset, nfds);
+
+out_arr:
+	free_fdarr(fdt);
+out_fdt:
 	kfree(fdt);
+out:
 	return NULL;
 }
 
@@ -290,14 +201,17 @@ static int expand_fdtable(struct files_struct *files, int nr)
 	 * we dropped the lock
 	 */
 	cur_fdt = files_fdtable(files);
-	if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) {
+	if (nr >= cur_fdt->max_fds) {
 		/* Continue as planned */
 		copy_fdtable(new_fdt, cur_fdt);
 		rcu_assign_pointer(files->fdt, new_fdt);
-		free_fdtable(cur_fdt);
+		if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
+			call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
 	} else {
 		/* Somebody else expanded, so undo our attempt */
-		__free_fdtable(new_fdt);
+		free_fdarr(new_fdt);
+		free_fdset(new_fdt);
+		kfree(new_fdt);
 	}
 	return 1;
 }
@@ -316,11 +230,10 @@ int expand_files(struct files_struct *files, int nr)
 	fdt = files_fdtable(files);
 
 	/* Do we need to expand? */
-	if (nr < fdt->max_fdset && nr < fdt->max_fds)
+	if (nr < fdt->max_fds)
 		return 0;
 	/* Can we expand? */
-	if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN ||
-	    nr >= NR_OPEN)
+	if (nr >= NR_OPEN)
 		return -EMFILE;
 
 	/* All good, so we try */
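The new alloc_fdtable() above replaces the old doubling loop with closed-form sizing: express the request in 1024-byte chunks of fd array, round up to a power of two, then convert back to a count of fds (so tables grow 1024B, 2048B, 4096B, and so on, clamped to NR_OPEN). A standalone sketch of that arithmetic, where roundup_pow2() and fdtable_size() are illustrative names, not kernel API:

#include <stdio.h>

static unsigned int roundup_pow2(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

static unsigned int fdtable_size(unsigned int nr, unsigned int ptr_size)
{
	unsigned int fds_per_chunk = 1024 / ptr_size;	/* 128 on 64-bit */

	nr /= fds_per_chunk;		/* how many 1024B chunks so far */
	nr = roundup_pow2(nr + 1);	/* next power-of-two chunk count */
	return nr * fds_per_chunk;	/* back to a number of fds */
}

int main(void)
{
	/* e.g. on a 64-bit box, expanding past fd 200 yields a 256-slot
	 * table and past fd 300 yields 512 slots */
	printf("%u\n", fdtable_size(200, sizeof(void *)));	/* 256 */
	printf("%u\n", fdtable_size(300, sizeof(void *)));	/* 512 */
	return 0;
}

The open_fds and close_on_exec bitmaps then come from a single allocation of 2 * nr / BITS_PER_BYTE bytes split in half, which is why free_fdset() above frees only fdt->open_fds.
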
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d38e0d575e4..cceaf57e377 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -55,7 +55,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
 	spin_lock_init(&transaction->t_handle_lock);
 
 	/* Set up the commit timer for the new transaction. */
-	journal->j_commit_timer.expires = transaction->t_expires;
+	journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
 	add_timer(&journal->j_commit_timer);
 
 	J_ASSERT(journal->j_running_transaction == NULL);
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index eb550b339bb..38f70ac03be 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -29,31 +29,21 @@
 /*
  *	file system option (superblock flag)
  */
-/* mount time flag to disable journaling to disk */
-#define JFS_NOINTEGRITY 0x00000010
+
+/* directory option */
+#define JFS_UNICODE	0x00000001	/* unicode name */
 
 /* mount time flags for error handling */
 #define JFS_ERR_REMOUNT_RO 0x00000002	/* remount read-only */
 #define JFS_ERR_CONTINUE   0x00000004	/* continue */
 #define JFS_ERR_PANIC      0x00000008	/* panic */
 
+/* Quota support */
 #define	JFS_USRQUOTA	0x00000010
 #define	JFS_GRPQUOTA	0x00000020
 
-/* platform option (conditional compilation) */
-#define JFS_AIX		0x80000000	/* AIX support */
-/*	POSIX name/directory  support */
-
-#define JFS_OS2		0x40000000	/* OS/2 support */
-/*	case-insensitive name/directory support */
-
-#define JFS_DFS		0x20000000	/* DCE DFS LFS support */
-
-#define JFS_LINUX	0x10000000	/* Linux support */
-/*	case-sensitive name/directory support */
-
-/* directory option */
-#define JFS_UNICODE	0x00000001	/* unicode name */
+/* mount time flag to disable journaling to disk */
+#define JFS_NOINTEGRITY 0x00000040
 
 /* commit option */
 #define	JFS_COMMIT	0x00000f00	/* commit option mask */
@@ -61,6 +51,7 @@
 #define	JFS_LAZYCOMMIT	0x00000200	/* lazy commit */
 #define	JFS_TMPFS	0x00000400	/* temporary file system -
 					 * do not log/commit:
+					 * Never implemented
 					 */
 
 /* log logical volume option */
@@ -74,16 +65,25 @@
 #define JFS_SPARSE	0x00020000	/* sparse regular file */
 
 /* DASD Limits		F226941 */
-#define JFS_DASD_ENABLED 0x00040000	/* DASD limits enabled */
-#define	JFS_DASD_PRIME	0x00080000	/* Prime DASD usage on boot */
+#define JFS_DASD_ENABLED 0x00040000	/* DASD limits enabled */
+#define JFS_DASD_PRIME	0x00080000	/* Prime DASD usage on boot */
 
 /* big endian flag */
-#define	JFS_SWAP_BYTES	0x00100000	/* running on big endian computer */
+#define JFS_SWAP_BYTES	0x00100000	/* running on big endian computer */
 
 /* Directory index */
-#define JFS_DIR_INDEX	0x00200000	/* Persistent index for */
-					/* directory entries    */
+#define JFS_DIR_INDEX	0x00200000	/* Persistent index for */
 
+/* platform options */
+#define JFS_LINUX	0x10000000	/* Linux support */
+
+#define JFS_DFS		0x20000000	/* DCE DFS LFS support */
+/* Never implemented */
+
+#define JFS_OS2		0x40000000	/* OS/2 support */
+/* case-insensitive name/directory support */
+
+#define JFS_AIX		0x80000000	/* AIX support */
+/* POSIX name/directory support - Never implemented*/
 
 /*
  *	buffer cache configuration
diff --git a/fs/open.c b/fs/open.c
index 0d94319e868..c989fb4cf7b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -864,8 +864,7 @@ int get_unused_fd(void)
 
 repeat:
 	fdt = files_fdtable(files);
-	fd = find_next_zero_bit(fdt->open_fds->fds_bits,
-				fdt->max_fdset,
+	fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
 				files->next_fd);
 
 	/*
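The one-line jbd change above feeds the commit timer through round_jiffies() so the roughly five-second commit timers of many mounted filesystems expire on the same tick instead of scattering wakeups across the second. Conceptually the rounding behaves like the sketch below, a deliberately simplified model; the real round_jiffies() also applies a per-CPU skew and avoids rounding a timer into the past:

#include <stdio.h>

#define HZ 250	/* ticks per second; the value depends on kernel config */

static unsigned long round_to_second(unsigned long j)
{
	unsigned long rem = j % HZ;

	return rem ? j + (HZ - rem) : j;	/* next whole-second tick */
}

int main(void)
{
	printf("%lu\n", round_to_second(1234));	/* prints 1250 */
	return 0;
}
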
"wchar: %llu\n" + "syscr: %llu\n" + "syscw: %llu\n" + "read_bytes: %llu\n" + "write_bytes: %llu\n" + "cancelled_write_bytes: %llu\n", + (unsigned long long)task->rchar, + (unsigned long long)task->wchar, + (unsigned long long)task->syscr, + (unsigned long long)task->syscw, + (unsigned long long)task->ioac.read_bytes, + (unsigned long long)task->ioac.write_bytes, + (unsigned long long)task->ioac.cancelled_write_bytes); +} +#endif + /* * Thread groups */ @@ -1855,6 +1876,9 @@ static struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_FAULT_INJECTION REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), #endif +#ifdef CONFIG_TASK_IO_ACCOUNTING + INF("io", S_IRUGO, pid_io_accounting), +#endif }; static int proc_tgid_base_readdir(struct file * filp, diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index dc3e580d1dc..92ea7743fe8 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -47,6 +47,7 @@ #include <linux/vmalloc.h> #include <linux/crash_dump.h> #include <linux/pid_namespace.h> +#include <linux/compile.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/io.h> @@ -253,8 +254,15 @@ static int version_read_proc(char *page, char **start, off_t off, { int len; - len = sprintf(page, linux_banner, - utsname()->release, utsname()->version); + /* FIXED STRING! Don't touch! */ + len = snprintf(page, PAGE_SIZE, + "%s version %s" + " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")" + " (" LINUX_COMPILER ")" + " %s\n", + utsname()->sysname, + utsname()->release, + utsname()->version); return proc_calc_metrics(page, start, off, count, eof, len); } diff --git a/fs/select.c b/fs/select.c index dcbc1112b7e..fe0893afd93 100644 --- a/fs/select.c +++ b/fs/select.c @@ -311,7 +311,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, { fd_set_bits fds; void *bits; - int ret, max_fdset; + int ret, max_fds; unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ @@ -321,13 +321,13 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, if (n < 0) goto out_nofds; - /* max_fdset can increase, so grab it once to avoid race */ + /* max_fds can increase, so grab it once to avoid race */ rcu_read_lock(); fdt = files_fdtable(current->files); - max_fdset = fdt->max_fdset; + max_fds = fdt->max_fds; rcu_read_unlock(); - if (n > max_fdset) - n = max_fdset; + if (n > max_fds) + n = max_fds; /* * We need 6 bitmaps (in/out/ex for both incoming and outgoing), diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 8e6b56fc1ca..b56eb754e2d 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1406,7 +1406,7 @@ xfs_vm_direct_IO( xfs_end_io_direct); } - if (unlikely(ret <= 0 && iocb->private)) + if (unlikely(ret != -EIOCBQUEUED && iocb->private)) xfs_destroy_ioend(iocb->private); return ret; } |