Diffstat (limited to 'fs')
-rw-r--r--  fs/buffer.c                   33
-rw-r--r--  fs/cifs/file.c                 2
-rw-r--r--  fs/compat.c                   10
-rw-r--r--  fs/direct-io.c               323
-rw-r--r--  fs/exec.c                      2
-rw-r--r--  fs/fcntl.c                     5
-rw-r--r--  fs/file.c                    255
-rw-r--r--  fs/jbd/transaction.c           2
-rw-r--r--  fs/jfs/jfs_filsys.h           42
-rw-r--r--  fs/open.c                      3
-rw-r--r--  fs/proc/base.c                24
-rw-r--r--  fs/proc/proc_misc.c           12
-rw-r--r--  fs/select.c                   10
-rw-r--r--  fs/xfs/linux-2.6/xfs_aops.c    2
14 files changed, 321 insertions, 404 deletions
diff --git a/fs/buffer.c b/fs/buffer.c
index 517860f2d75..d1f1b54d310 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -35,6 +35,7 @@
#include <linux/hash.h>
#include <linux/suspend.h>
#include <linux/buffer_head.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/notifier.h>
#include <linux/cpu.h>
@@ -724,20 +725,21 @@ int __set_page_dirty_buffers(struct page *page)
}
spin_unlock(&mapping->private_lock);
- if (!TestSetPageDirty(page)) {
- write_lock_irq(&mapping->tree_lock);
- if (page->mapping) { /* Race with truncate? */
- if (mapping_cap_account_dirty(mapping))
- __inc_zone_page_state(page, NR_FILE_DIRTY);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page),
- PAGECACHE_TAG_DIRTY);
+ if (TestSetPageDirty(page))
+ return 0;
+
+ write_lock_irq(&mapping->tree_lock);
+ if (page->mapping) { /* Race with truncate? */
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ task_io_account_write(PAGE_CACHE_SIZE);
}
- write_unlock_irq(&mapping->tree_lock);
- __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
- return 1;
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
}
- return 0;
+ write_unlock_irq(&mapping->tree_lock);
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+ return 1;
}
EXPORT_SYMBOL(__set_page_dirty_buffers);
@@ -2851,8 +2853,13 @@ int try_to_free_buffers(struct page *page)
* could encounter a non-uptodate page, which is unresolvable.
* This only applies in the rare case where try_to_free_buffers
* succeeds but the page is not freed.
+ *
+ * Also, during truncate, discard_buffer will have marked all
+ * the page's buffers clean. We discover that here and clean
+ * the page also.
*/
- clear_page_dirty(page);
+ if (test_clear_page_dirty(page))
+ task_io_account_cancelled_write(PAGE_CACHE_SIZE);
}
out:
if (buffers_to_free) {
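
The buffer.c hunks above charge a write to the dirtying task the first time a page is dirtied, and cancel that charge if truncate cleans the page again before writeback. A rough sketch of the accounting helpers this relies on (illustrative only; the real inlines live in include/linux/task_io_accounting_ops.h and compile away without CONFIG_TASK_IO_ACCOUNTING):

	/* Sketch: charge bytes written into the page cache to the current task. */
	static inline void task_io_account_write(size_t bytes)
	{
		current->ioac.write_bytes += bytes;
	}

	/* Sketch: back out a charge when truncate discards dirty, unwritten data. */
	static inline void task_io_account_cancelled_write(size_t bytes)
	{
		current->ioac.cancelled_write_bytes += bytes;
	}
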
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 1aa95a50cac..0f05cab5d24 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -29,6 +29,7 @@
#include <linux/pagevec.h>
#include <linux/smp_lock.h>
#include <linux/writeback.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/delay.h>
#include <asm/div64.h>
#include "cifsfs.h"
@@ -1812,6 +1813,7 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
cFYI(1, ("Read error in readpages: %d", rc));
break;
} else if (bytes_read > 0) {
+ task_io_account_read(bytes_read);
pSMBr = (struct smb_com_read_rsp *)smb_read_data;
cifs_copy_cache_pages(mapping, page_list, bytes_read,
smb_read_data + 4 /* RFC1001 hdr */ +
diff --git a/fs/compat.c b/fs/compat.c
index b766964a625..0ec70e3cee0 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1679,19 +1679,19 @@ int compat_core_sys_select(int n, compat_ulong_t __user *inp,
{
fd_set_bits fds;
char *bits;
- int size, max_fdset, ret = -EINVAL;
+ int size, max_fds, ret = -EINVAL;
struct fdtable *fdt;
if (n < 0)
goto out_nofds;
- /* max_fdset can increase, so grab it once to avoid race */
+ /* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
fdt = files_fdtable(current->files);
- max_fdset = fdt->max_fdset;
+ max_fds = fdt->max_fds;
rcu_read_unlock();
- if (n > max_fdset)
- n = max_fdset;
+ if (n > max_fds)
+ n = max_fds;
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5981e17f46f..d9d0833444f 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -27,6 +27,7 @@
#include <linux/slab.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
#include <linux/bio.h>
#include <linux/wait.h>
#include <linux/err.h>
@@ -121,8 +122,7 @@ struct dio {
/* BIO completion state */
spinlock_t bio_lock; /* protects BIO fields below */
- int bio_count; /* nr bios to be completed */
- int bios_in_flight; /* nr bios in flight */
+ unsigned long refcount; /* direct_io_worker() and bios */
struct bio *bio_list; /* singly linked via bi_private */
struct task_struct *waiter; /* waiting task (NULL if none) */
@@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio)
return dio->pages[dio->head++];
}
-/*
- * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_block. Pass the
- * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_block calls and dio_complete.
- */
-static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
-{
- if (dio->end_io && dio->result)
- dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
- if (dio->lock_type == DIO_LOCKING)
- /* lockdep: non-owner release */
- up_read_non_owner(&dio->inode->i_alloc_sem);
-}
-
-/*
- * Called when a BIO has been processed. If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This releases locks as dictated by the locking type, lets interested parties
+ * know that a DIO operation has completed, and calculates the resulting return
+ * code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block. Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
*/
-static void finished_one_bio(struct dio *dio)
+static int dio_complete(struct dio *dio, loff_t offset, int ret)
{
- unsigned long flags;
+ ssize_t transferred = 0;
- spin_lock_irqsave(&dio->bio_lock, flags);
- if (dio->bio_count == 1) {
- if (dio->is_async) {
- ssize_t transferred;
- loff_t offset;
-
- /*
- * Last reference to the dio is going away.
- * Drop spinlock and complete the DIO.
- */
- spin_unlock_irqrestore(&dio->bio_lock, flags);
+ /*
+ * AIO submission can race with bio completion to get here while
+ * expecting to have the last io completed by bio completion.
+ * In that case -EIOCBQUEUED is in fact not an error we want
+ * to preserve through this call.
+ */
+ if (ret == -EIOCBQUEUED)
+ ret = 0;
- /* Check for short read case */
- transferred = dio->result;
- offset = dio->iocb->ki_pos;
+ if (dio->result) {
+ transferred = dio->result;
- if ((dio->rw == READ) &&
- ((offset + transferred) > dio->i_size))
- transferred = dio->i_size - offset;
+ /* Check for short read case */
+ if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+ transferred = dio->i_size - offset;
+ }
- /* check for error in completion path */
- if (dio->io_error)
- transferred = dio->io_error;
+ if (dio->end_io && dio->result)
+ dio->end_io(dio->iocb, offset, transferred,
+ dio->map_bh.b_private);
+ if (dio->lock_type == DIO_LOCKING)
+ /* lockdep: non-owner release */
+ up_read_non_owner(&dio->inode->i_alloc_sem);
- dio_complete(dio, offset, transferred);
+ if (ret == 0)
+ ret = dio->page_errors;
+ if (ret == 0)
+ ret = dio->io_error;
+ if (ret == 0)
+ ret = transferred;
- /* Complete AIO later if falling back to buffered i/o */
- if (dio->result == dio->size ||
- ((dio->rw == READ) && dio->result)) {
- aio_complete(dio->iocb, transferred, 0);
- kfree(dio);
- return;
- } else {
- /*
- * Falling back to buffered
- */
- spin_lock_irqsave(&dio->bio_lock, flags);
- dio->bio_count--;
- if (dio->waiter)
- wake_up_process(dio->waiter);
- spin_unlock_irqrestore(&dio->bio_lock, flags);
- return;
- }
- }
- }
- dio->bio_count--;
- spin_unlock_irqrestore(&dio->bio_lock, flags);
+ return ret;
}
static int dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
{
struct dio *dio = bio->bi_private;
+ unsigned long remaining;
+ unsigned long flags;
if (bio->bi_size)
return 1;
/* cleanup the bio */
dio_bio_complete(dio, bio);
+
+ spin_lock_irqsave(&dio->bio_lock, flags);
+ remaining = --dio->refcount;
+ if (remaining == 1 && dio->waiter)
+ wake_up_process(dio->waiter);
+ spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+ if (remaining == 0) {
+ int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+ aio_complete(dio->iocb, ret, 0);
+ kfree(dio);
+ }
+
return 0;
}
@@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
spin_lock_irqsave(&dio->bio_lock, flags);
bio->bi_private = dio->bio_list;
dio->bio_list = bio;
- dio->bios_in_flight--;
- if (dio->waiter && dio->bios_in_flight == 0)
+ if (--dio->refcount == 1 && dio->waiter)
wake_up_process(dio->waiter);
spin_unlock_irqrestore(&dio->bio_lock, flags);
return 0;
@@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
* In the AIO read case we speculatively dirty the pages before starting IO.
* During IO completion, any of these pages which happen to have been written
* back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
*/
static void dio_bio_submit(struct dio *dio)
{
@@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio)
unsigned long flags;
bio->bi_private = dio;
+
spin_lock_irqsave(&dio->bio_lock, flags);
- dio->bio_count++;
- dio->bios_in_flight++;
+ dio->refcount++;
spin_unlock_irqrestore(&dio->bio_lock, flags);
+
if (dio->is_async && dio->rw == READ)
bio_set_pages_dirty(bio);
+
submit_bio(dio->rw, bio);
dio->bio = NULL;
@@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio)
}
/*
- * Wait for the next BIO to complete. Remove it and return it.
+ * Wait for the next BIO to complete. Remove it and return it. NULL is
+ * returned once all BIOs have been completed. This must only be called once
+ * all bios have been issued so that dio->refcount can only decrease. This
+ * requires that the caller hold a reference on the dio.
*/
static struct bio *dio_await_one(struct dio *dio)
{
unsigned long flags;
- struct bio *bio;
+ struct bio *bio = NULL;
spin_lock_irqsave(&dio->bio_lock, flags);
- while (dio->bio_list == NULL) {
- set_current_state(TASK_UNINTERRUPTIBLE);
- if (dio->bio_list == NULL) {
- dio->waiter = current;
- spin_unlock_irqrestore(&dio->bio_lock, flags);
- blk_run_address_space(dio->inode->i_mapping);
- io_schedule();
- spin_lock_irqsave(&dio->bio_lock, flags);
- dio->waiter = NULL;
- }
- set_current_state(TASK_RUNNING);
+
+ /*
+ * Wait as long as the list is empty and there are bios in flight. bio
+ * completion drops the count, maybe adds to the list, and wakes while
+ * holding the bio_lock so we don't need set_current_state()'s barrier
+ * and can call it after testing our condition.
+ */
+ while (dio->refcount > 1 && dio->bio_list == NULL) {
+ __set_current_state(TASK_UNINTERRUPTIBLE);
+ dio->waiter = current;
+ spin_unlock_irqrestore(&dio->bio_lock, flags);
+ io_schedule();
+ /* wake up sets us TASK_RUNNING */
+ spin_lock_irqsave(&dio->bio_lock, flags);
+ dio->waiter = NULL;
+ }
+ if (dio->bio_list) {
+ bio = dio->bio_list;
+ dio->bio_list = bio->bi_private;
}
- bio = dio->bio_list;
- dio->bio_list = bio->bi_private;
spin_unlock_irqrestore(&dio->bio_lock, flags);
return bio;
}
@@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
}
bio_put(bio);
}
- finished_one_bio(dio);
return uptodate ? 0 : -EIO;
}
/*
- * Wait on and process all in-flight BIOs.
+ * Wait on and process all in-flight BIOs. This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through dio_bio_complete. IO
+ * errors are propagated through dio->io_error and should be propagated via
+ * dio_complete().
*/
-static int dio_await_completion(struct dio *dio)
+static void dio_await_completion(struct dio *dio)
{
- int ret = 0;
-
- if (dio->bio)
- dio_bio_submit(dio);
-
- /*
- * The bio_lock is not held for the read of bio_count.
- * This is ok since it is the dio_bio_complete() that changes
- * bio_count.
- */
- while (dio->bio_count) {
- struct bio *bio = dio_await_one(dio);
- int ret2;
-
- ret2 = dio_bio_complete(dio, bio);
- if (ret == 0)
- ret = ret2;
- }
- return ret;
+ struct bio *bio;
+ do {
+ bio = dio_await_one(dio);
+ if (bio)
+ dio_bio_complete(dio, bio);
+ } while (bio);
}
/*
@@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page,
{
int ret = 0;
+ if (dio->rw & WRITE) {
+ /*
+ * Read accounting is performed in submit_bio()
+ */
+ task_io_account_write(len);
+ }
+
/*
* Can we just grow the current page's presence in the dio?
*/
@@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
struct dio *dio)
{
unsigned long user_addr;
+ unsigned long flags;
int seg;
ssize_t ret = 0;
ssize_t ret2;
@@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
dio->iocb = iocb;
dio->i_size = i_size_read(inode);
- /*
- * BIO completion state.
- *
- * ->bio_count starts out at one, and we decrement it to zero after all
- * BIOs are submitted. This to avoid the situation where a really fast
- * (or synchronous) device could take the count to zero while we're
- * still submitting BIOs.
- */
- dio->bio_count = 1;
- dio->bios_in_flight = 0;
spin_lock_init(&dio->bio_lock);
+ dio->refcount = 1;
dio->bio_list = NULL;
dio->waiter = NULL;
@@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
if (dio->bio)
dio_bio_submit(dio);
+ /* All IO is now issued, send it on its way */
+ blk_run_address_space(inode->i_mapping);
+
/*
* It is possible that, we return short IO due to end of file.
* In that case, we need to release all the pages we got hold on.
@@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
mutex_unlock(&dio->inode->i_mutex);
/*
- * OK, all BIOs are submitted, so we can decrement bio_count to truly
- * reflect the number of to-be-processed BIOs.
+ * The only time we want to leave bios in flight is when a successful
+ * partial aio read or full aio write have been setup. In that case
+ * bio completion will call aio_complete. The only time it's safe to
+ * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+ * This had *better* be the only place that raises -EIOCBQUEUED.
*/
- if (dio->is_async) {
- int should_wait = 0;
+ BUG_ON(ret == -EIOCBQUEUED);
+ if (dio->is_async && ret == 0 && dio->result &&
+ ((rw & READ) || (dio->result == dio->size)))
+ ret = -EIOCBQUEUED;
- if (dio->result < dio->size && (rw & WRITE)) {
- dio->waiter = current;
- should_wait = 1;
- }
- if (ret == 0)
- ret = dio->result;
- finished_one_bio(dio); /* This can free the dio */
- blk_run_address_space(inode->i_mapping);
- if (should_wait) {
- unsigned long flags;
- /*
- * Wait for already issued I/O to drain out and
- * release its references to user-space pages
- * before returning to fallback on buffered I/O
- */
-
- spin_lock_irqsave(&dio->bio_lock, flags);
- set_current_state(TASK_UNINTERRUPTIBLE);
- while (dio->bio_count) {
- spin_unlock_irqrestore(&dio->bio_lock, flags);
- io_schedule();
- spin_lock_irqsave(&dio->bio_lock, flags);
- set_current_state(TASK_UNINTERRUPTIBLE);
- }
- spin_unlock_irqrestore(&dio->bio_lock, flags);
- set_current_state(TASK_RUNNING);
- kfree(dio);
- }
- } else {
- ssize_t transferred = 0;
-
- finished_one_bio(dio);
- ret2 = dio_await_completion(dio);
- if (ret == 0)
- ret = ret2;
- if (ret == 0)
- ret = dio->page_errors;
- if (dio->result) {
- loff_t i_size = i_size_read(inode);
-
- transferred = dio->result;
- /*
- * Adjust the return value if the read crossed a
- * non-block-aligned EOF.
- */
- if (rw == READ && (offset + transferred > i_size))
- transferred = i_size - offset;
- }
- dio_complete(dio, offset, transferred);
- if (ret == 0)
- ret = transferred;
+ if (ret != -EIOCBQUEUED)
+ dio_await_completion(dio);
- /* We could have also come here on an AIO file extend */
- if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
- ret >= 0 && dio->result == dio->size)
- /*
- * For AIO writes where we have completed the
- * i/o, we have to mark the the aio complete.
- */
- aio_complete(iocb, ret, 0);
+ /*
+ * Sync will always be dropping the final ref and completing the
+ * operation. AIO can if it was a broken operation described above or
+ * in fact if all the bios race to complete before we get here. In
+ * that case dio_complete() translates the EIOCBQUEUED into the proper
+ * return code that the caller will hand to aio_complete().
+ *
+ * This is managed by the bio_lock instead of being an atomic_t so that
+ * completion paths can drop their ref and use the remaining count to
+ * decide to wake the submission path atomically.
+ */
+ spin_lock_irqsave(&dio->bio_lock, flags);
+ ret2 = --dio->refcount;
+ spin_unlock_irqrestore(&dio->bio_lock, flags);
+ BUG_ON(!dio->is_async && ret2 != 0);
+ if (ret2 == 0) {
+ ret = dio_complete(dio, offset, ret);
kfree(dio);
- }
+ } else
+ BUG_ON(ret != -EIOCBQUEUED);
+
return ret;
}
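
The direct-io.c rework replaces the bio_count/bios_in_flight pair with a single refcount: direct_io_worker() holds one reference for the whole submission path, each submitted bio holds one, and whichever path drops the count to zero calls dio_complete() and frees the dio. A small userspace model of that pattern (an illustrative sketch, not kernel code; the bio_lock locking is omitted):

	#include <stdio.h>

	struct dio_model {
		unsigned long refcount;		/* bio_lock-protected in the kernel */
		int completed;
	};

	static void dio_put(struct dio_model *dio)
	{
		if (--dio->refcount == 0) {
			dio->completed = 1;	/* dio_complete() + kfree() happen here */
			printf("last reference dropped: complete the dio\n");
		}
	}

	int main(void)
	{
		struct dio_model dio = { .refcount = 1, .completed = 0 };	/* submitter's ref */
		int nr_bios = 3, i;

		for (i = 0; i < nr_bios; i++)
			dio.refcount++;		/* dio_bio_submit() takes one ref per bio */

		for (i = 0; i < nr_bios; i++)
			dio_put(&dio);		/* bio end_io paths drop theirs */

		/*
		 * Even if every bio finished first, the dio is still alive here,
		 * so the submission path can safely inspect dio->result before
		 * dropping the final reference.
		 */
		dio_put(&dio);
		return dio.completed ? 0 : 1;
	}
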
diff --git a/fs/exec.c b/fs/exec.c
index 12d8cd461b4..11fe93f7363 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -783,7 +783,7 @@ static void flush_old_files(struct files_struct * files)
j++;
i = j * __NFDBITS;
fdt = files_fdtable(files);
- if (i >= fdt->max_fds || i >= fdt->max_fdset)
+ if (i >= fdt->max_fds)
break;
set = fdt->close_on_exec->fds_bits[j];
if (!set)
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 2bdaef35da5..8e382a5d51b 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -77,10 +77,9 @@ repeat:
start = files->next_fd;
newfd = start;
- if (start < fdt->max_fdset) {
+ if (start < fdt->max_fds)
newfd = find_next_zero_bit(fdt->open_fds->fds_bits,
- fdt->max_fdset, start);
- }
+ fdt->max_fds, start);
error = -EMFILE;
if (newfd >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur)
diff --git a/fs/file.c b/fs/file.c
index 51aef675470..857fa49e984 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -32,46 +32,28 @@ struct fdtable_defer {
*/
static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
-
-/*
- * Allocate an fd array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
- */
-struct file ** alloc_fd_array(int num)
+static inline void * alloc_fdmem(unsigned int size)
{
- struct file **new_fds;
- int size = num * sizeof(struct file *);
-
if (size <= PAGE_SIZE)
- new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
- else
- new_fds = (struct file **) vmalloc(size);
- return new_fds;
+ return kmalloc(size, GFP_KERNEL);
+ else
+ return vmalloc(size);
}
-void free_fd_array(struct file **array, int num)
+static inline void free_fdarr(struct fdtable *fdt)
{
- int size = num * sizeof(struct file *);
-
- if (!array) {
- printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
- return;
- }
-
- if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
- return;
- else if (size <= PAGE_SIZE)
- kfree(array);
+ if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
+ kfree(fdt->fd);
else
- vfree(array);
+ vfree(fdt->fd);
}
-static void __free_fdtable(struct fdtable *fdt)
+static inline void free_fdset(struct fdtable *fdt)
{
- free_fdset(fdt->open_fds, fdt->max_fdset);
- free_fdset(fdt->close_on_exec, fdt->max_fdset);
- free_fd_array(fdt->fd, fdt->max_fds);
- kfree(fdt);
+ if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
+ kfree(fdt->open_fds);
+ else
+ vfree(fdt->open_fds);
}
static void free_fdtable_work(struct work_struct *work)
@@ -86,41 +68,32 @@ static void free_fdtable_work(struct work_struct *work)
spin_unlock_bh(&f->lock);
while(fdt) {
struct fdtable *next = fdt->next;
- __free_fdtable(fdt);
+ vfree(fdt->fd);
+ free_fdset(fdt);
+ kfree(fdt);
fdt = next;
}
}
-static void free_fdtable_rcu(struct rcu_head *rcu)
+void free_fdtable_rcu(struct rcu_head *rcu)
{
struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
- int fdset_size, fdarray_size;
struct fdtable_defer *fddef;
BUG_ON(!fdt);
- fdset_size = fdt->max_fdset / 8;
- fdarray_size = fdt->max_fds * sizeof(struct file *);
- if (fdt->free_files) {
+ if (fdt->max_fds <= NR_OPEN_DEFAULT) {
/*
- * The this fdtable was embedded in the files structure
- * and the files structure itself was getting destroyed.
- * It is now safe to free the files structure.
+ * This fdtable is embedded in the files structure and that
+ * structure itself is getting destroyed.
*/
- kmem_cache_free(files_cachep, fdt->free_files);
+ kmem_cache_free(files_cachep,
+ container_of(fdt, struct files_struct, fdtab));
return;
}
- if (fdt->max_fdset <= EMBEDDED_FD_SET_SIZE &&
- fdt->max_fds <= NR_OPEN_DEFAULT) {
- /*
- * The fdtable was embedded
- */
- return;
- }
- if (fdset_size <= PAGE_SIZE && fdarray_size <= PAGE_SIZE) {
- kfree(fdt->open_fds);
- kfree(fdt->close_on_exec);
+ if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
kfree(fdt->fd);
+ kfree(fdt->open_fds);
kfree(fdt);
} else {
fddef = &get_cpu_var(fdtable_defer_list);
@@ -134,136 +107,74 @@ static void free_fdtable_rcu(struct rcu_head *rcu)
}
}
-void free_fdtable(struct fdtable *fdt)
-{
- if (fdt->free_files ||
- fdt->max_fdset > EMBEDDED_FD_SET_SIZE ||
- fdt->max_fds > NR_OPEN_DEFAULT)
- call_rcu(&fdt->rcu, free_fdtable_rcu);
-}
-
/*
* Expand the fdset in the files_struct. Called with the files spinlock
* held for write.
*/
-static void copy_fdtable(struct fdtable *nfdt, struct fdtable *fdt)
+static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt)
{
- int i;
- int count;
-
- BUG_ON(nfdt->max_fdset < fdt->max_fdset);
- BUG_ON(nfdt->max_fds < fdt->max_fds);
- /* Copy the existing tables and install the new pointers */
-
- i = fdt->max_fdset / (sizeof(unsigned long) * 8);
- count = (nfdt->max_fdset - fdt->max_fdset) / 8;
+ unsigned int cpy, set;
- /*
- * Don't copy the entire array if the current fdset is
- * not yet initialised.
- */
- if (i) {
- memcpy (nfdt->open_fds, fdt->open_fds,
- fdt->max_fdset/8);
- memcpy (nfdt->close_on_exec, fdt->close_on_exec,
- fdt->max_fdset/8);
- memset (&nfdt->open_fds->fds_bits[i], 0, count);
- memset (&nfdt->close_on_exec->fds_bits[i], 0, count);
- }
-
- /* Don't copy/clear the array if we are creating a new
- fd array for fork() */
- if (fdt->max_fds) {
- memcpy(nfdt->fd, fdt->fd,
- fdt->max_fds * sizeof(struct file *));
- /* clear the remainder of the array */
- memset(&nfdt->fd[fdt->max_fds], 0,
- (nfdt->max_fds - fdt->max_fds) *
- sizeof(struct file *));
- }
-}
-
-/*
- * Allocate an fdset array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
- */
-fd_set * alloc_fdset(int num)
-{
- fd_set *new_fdset;
- int size = num / 8;
+ BUG_ON(nfdt->max_fds < ofdt->max_fds);
+ if (ofdt->max_fds == 0)
+ return;
- if (size <= PAGE_SIZE)
- new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL);
- else
- new_fdset = (fd_set *) vmalloc(size);
- return new_fdset;
+ cpy = ofdt->max_fds * sizeof(struct file *);
+ set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *);
+ memcpy(nfdt->fd, ofdt->fd, cpy);
+ memset((char *)(nfdt->fd) + cpy, 0, set);
+
+ cpy = ofdt->max_fds / BITS_PER_BYTE;
+ set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE;
+ memcpy(nfdt->open_fds, ofdt->open_fds, cpy);
+ memset((char *)(nfdt->open_fds) + cpy, 0, set);
+ memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy);
+ memset((char *)(nfdt->close_on_exec) + cpy, 0, set);
}
-void free_fdset(fd_set *array, int num)
+static struct fdtable * alloc_fdtable(unsigned int nr)
{
- if (num <= EMBEDDED_FD_SET_SIZE) /* Don't free an embedded fdset */
- return;
- else if (num <= 8 * PAGE_SIZE)
- kfree(array);
- else
- vfree(array);
-}
+ struct fdtable *fdt;
+ char *data;
-static struct fdtable *alloc_fdtable(int nr)
-{
- struct fdtable *fdt = NULL;
- int nfds = 0;
- fd_set *new_openset = NULL, *new_execset = NULL;
- struct file **new_fds;
+ /*
+ * Figure out how many fds we actually want to support in this fdtable.
+ * Allocation steps are keyed to the size of the fdarray, since it
+ * grows far faster than any of the other dynamic data. We try to fit
+ * the fdarray into comfortable page-tuned chunks: starting at 1024B
+ * and growing in powers of two from there on.
+ */
+ nr /= (1024 / sizeof(struct file *));
+ nr = roundup_pow_of_two(nr + 1);
+ nr *= (1024 / sizeof(struct file *));
+ if (nr > NR_OPEN)
+ nr = NR_OPEN;
- fdt = kzalloc(sizeof(*fdt), GFP_KERNEL);
+ fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
if (!fdt)
- goto out;
-
- nfds = max_t(int, 8 * L1_CACHE_BYTES, roundup_pow_of_two(nr + 1));
- if (nfds > NR_OPEN)
- nfds = NR_OPEN;
+ goto out;
+ fdt->max_fds = nr;
+ data = alloc_fdmem(nr * sizeof(struct file *));
+ if (!data)
+ goto out_fdt;
+ fdt->fd = (struct file **)data;
+ data = alloc_fdmem(max_t(unsigned int,
+ 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES));
+ if (!data)
+ goto out_arr;
+ fdt->open_fds = (fd_set *)data;
+ data += nr / BITS_PER_BYTE;
+ fdt->close_on_exec = (fd_set *)data;
+ INIT_RCU_HEAD(&fdt->rcu);
+ fdt->next = NULL;
- new_openset = alloc_fdset(nfds);
- new_execset = alloc_fdset(nfds);
- if (!new_openset || !new_execset)
- goto out;
- fdt->open_fds = new_openset;
- fdt->close_on_exec = new_execset;
- fdt->max_fdset = nfds;
-
- nfds = NR_OPEN_DEFAULT;
- /*
- * Expand to the max in easy steps, and keep expanding it until
- * we have enough for the requested fd array size.
- */
- do {
-#if NR_OPEN_DEFAULT < 256
- if (nfds < 256)
- nfds = 256;
- else
-#endif
- if (nfds < (PAGE_SIZE / sizeof(struct file *)))
- nfds = PAGE_SIZE / sizeof(struct file *);
- else {
- nfds = nfds * 2;
- if (nfds > NR_OPEN)
- nfds = NR_OPEN;
- }
- } while (nfds <= nr);
- new_fds = alloc_fd_array(nfds);
- if (!new_fds)
- goto out2;
- fdt->fd = new_fds;
- fdt->max_fds = nfds;
- fdt->free_files = NULL;
return fdt;
-out2:
- nfds = fdt->max_fdset;
-out:
- free_fdset(new_openset, nfds);
- free_fdset(new_execset, nfds);
+
+out_arr:
+ free_fdarr(fdt);
+out_fdt:
kfree(fdt);
+out:
return NULL;
}
@@ -290,14 +201,17 @@ static int expand_fdtable(struct files_struct *files, int nr)
* we dropped the lock
*/
cur_fdt = files_fdtable(files);
- if (nr >= cur_fdt->max_fds || nr >= cur_fdt->max_fdset) {
+ if (nr >= cur_fdt->max_fds) {
/* Continue as planned */
copy_fdtable(new_fdt, cur_fdt);
rcu_assign_pointer(files->fdt, new_fdt);
- free_fdtable(cur_fdt);
+ if (cur_fdt->max_fds > NR_OPEN_DEFAULT)
+ call_rcu(&cur_fdt->rcu, free_fdtable_rcu);
} else {
/* Somebody else expanded, so undo our attempt */
- __free_fdtable(new_fdt);
+ free_fdarr(new_fdt);
+ free_fdset(new_fdt);
+ kfree(new_fdt);
}
return 1;
}
@@ -316,11 +230,10 @@ int expand_files(struct files_struct *files, int nr)
fdt = files_fdtable(files);
/* Do we need to expand? */
- if (nr < fdt->max_fdset && nr < fdt->max_fds)
+ if (nr < fdt->max_fds)
return 0;
/* Can we expand? */
- if (fdt->max_fdset >= NR_OPEN || fdt->max_fds >= NR_OPEN ||
- nr >= NR_OPEN)
+ if (nr >= NR_OPEN)
return -EMFILE;
/* All good, so we try */
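
alloc_fdtable() above sizes the new table so the fd array fills a power-of-two number of 1024-byte chunks (the kernel additionally caps the result at NR_OPEN). A standalone sketch of that sizing rule, assuming pointer-sized fd array slots:

	#include <stdio.h>

	#define SLOTS_PER_KB (1024 / sizeof(void *))	/* file pointers per 1024 bytes */

	static unsigned long roundup_pow_of_two(unsigned long n)
	{
		unsigned long p = 1;

		while (p < n)
			p <<= 1;
		return p;
	}

	int main(void)
	{
		unsigned int requests[] = { 1, 100, 300, 5000 };
		unsigned int i;

		for (i = 0; i < sizeof(requests) / sizeof(requests[0]); i++) {
			unsigned long nr = requests[i];

			nr /= SLOTS_PER_KB;
			nr = roundup_pow_of_two(nr + 1);
			nr *= SLOTS_PER_KB;	/* kernel also clamps this to NR_OPEN */
			printf("request %u -> max_fds %lu (%lu bytes of fd array)\n",
			       requests[i], nr,
			       (unsigned long)(nr * sizeof(void *)));
		}
		return 0;
	}
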
diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c
index d38e0d575e4..cceaf57e377 100644
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -55,7 +55,7 @@ get_transaction(journal_t *journal, transaction_t *transaction)
spin_lock_init(&transaction->t_handle_lock);
/* Set up the commit timer for the new transaction. */
- journal->j_commit_timer.expires = transaction->t_expires;
+ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires);
add_timer(&journal->j_commit_timer);
J_ASSERT(journal->j_running_transaction == NULL);
diff --git a/fs/jfs/jfs_filsys.h b/fs/jfs/jfs_filsys.h
index eb550b339bb..38f70ac03be 100644
--- a/fs/jfs/jfs_filsys.h
+++ b/fs/jfs/jfs_filsys.h
@@ -29,31 +29,21 @@
/*
* file system option (superblock flag)
*/
-/* mount time flag to disable journaling to disk */
-#define JFS_NOINTEGRITY 0x00000010
+
+/* directory option */
+#define JFS_UNICODE 0x00000001 /* unicode name */
/* mount time flags for error handling */
#define JFS_ERR_REMOUNT_RO 0x00000002 /* remount read-only */
#define JFS_ERR_CONTINUE 0x00000004 /* continue */
#define JFS_ERR_PANIC 0x00000008 /* panic */
+/* Quota support */
#define JFS_USRQUOTA 0x00000010
#define JFS_GRPQUOTA 0x00000020
-/* platform option (conditional compilation) */
-#define JFS_AIX 0x80000000 /* AIX support */
-/* POSIX name/directory support */
-
-#define JFS_OS2 0x40000000 /* OS/2 support */
-/* case-insensitive name/directory support */
-
-#define JFS_DFS 0x20000000 /* DCE DFS LFS support */
-
-#define JFS_LINUX 0x10000000 /* Linux support */
-/* case-sensitive name/directory support */
-
-/* directory option */
-#define JFS_UNICODE 0x00000001 /* unicode name */
+/* mount time flag to disable journaling to disk */
+#define JFS_NOINTEGRITY 0x00000040
/* commit option */
#define JFS_COMMIT 0x00000f00 /* commit option mask */
@@ -61,6 +51,7 @@
#define JFS_LAZYCOMMIT 0x00000200 /* lazy commit */
#define JFS_TMPFS 0x00000400 /* temporary file system -
* do not log/commit:
+ * Never implemented
*/
/* log logical volume option */
@@ -74,16 +65,25 @@
#define JFS_SPARSE 0x00020000 /* sparse regular file */
/* DASD Limits F226941 */
-#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */
-#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */
+#define JFS_DASD_ENABLED 0x00040000 /* DASD limits enabled */
+#define JFS_DASD_PRIME 0x00080000 /* Prime DASD usage on boot */
/* big endian flag */
-#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */
+#define JFS_SWAP_BYTES 0x00100000 /* running on big endian computer */
/* Directory index */
-#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */
- /* directory entries */
+#define JFS_DIR_INDEX 0x00200000 /* Persistent index for */
+/* platform options */
+#define JFS_LINUX 0x10000000 /* Linux support */
+#define JFS_DFS 0x20000000 /* DCE DFS LFS support */
+/* Never implemented */
+
+#define JFS_OS2 0x40000000 /* OS/2 support */
+/* case-insensitive name/directory support */
+
+#define JFS_AIX 0x80000000 /* AIX support */
+/* POSIX name/directory support - Never implemented */
/*
* buffer cache configuration
diff --git a/fs/open.c b/fs/open.c
index 0d94319e868..c989fb4cf7b 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -864,8 +864,7 @@ int get_unused_fd(void)
repeat:
fdt = files_fdtable(files);
- fd = find_next_zero_bit(fdt->open_fds->fds_bits,
- fdt->max_fdset,
+ fd = find_next_zero_bit(fdt->open_fds->fds_bits, fdt->max_fds,
files->next_fd);
/*
diff --git a/fs/proc/base.c b/fs/proc/base.c
index fd959d5b5a8..77a57b5799c 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1804,6 +1804,27 @@ static int proc_base_fill_cache(struct file *filp, void *dirent, filldir_t filld
proc_base_instantiate, task, p);
}
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+static int proc_pid_io_accounting(struct task_struct *task, char *buffer)
+{
+ return sprintf(buffer,
+ "rchar: %llu\n"
+ "wchar: %llu\n"
+ "syscr: %llu\n"
+ "syscw: %llu\n"
+ "read_bytes: %llu\n"
+ "write_bytes: %llu\n"
+ "cancelled_write_bytes: %llu\n",
+ (unsigned long long)task->rchar,
+ (unsigned long long)task->wchar,
+ (unsigned long long)task->syscr,
+ (unsigned long long)task->syscw,
+ (unsigned long long)task->ioac.read_bytes,
+ (unsigned long long)task->ioac.write_bytes,
+ (unsigned long long)task->ioac.cancelled_write_bytes);
+}
+#endif
+
/*
* Thread groups
*/
@@ -1855,6 +1876,9 @@ static struct pid_entry tgid_base_stuff[] = {
#ifdef CONFIG_FAULT_INJECTION
REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject),
#endif
+#ifdef CONFIG_TASK_IO_ACCOUNTING
+ INF("io", S_IRUGO, pid_io_accounting),
+#endif
};
static int proc_tgid_base_readdir(struct file * filp,
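
The proc/base.c hunk adds a per-task /proc/<pid>/io file exposing the rchar/wchar/syscr/syscw counters together with the new read_bytes, write_bytes and cancelled_write_bytes fields. A minimal userspace reader (only useful on kernels built with CONFIG_TASK_IO_ACCOUNTING):

	#include <stdio.h>

	int main(void)
	{
		char line[128];
		FILE *f = fopen("/proc/self/io", "r");

		if (!f) {
			perror("/proc/self/io");
			return 1;
		}
		while (fgets(line, sizeof(line), f))
			fputs(line, stdout);	/* rchar:, wchar:, ..., cancelled_write_bytes: */
		fclose(f);
		return 0;
	}
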
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index dc3e580d1dc..92ea7743fe8 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -47,6 +47,7 @@
#include <linux/vmalloc.h>
#include <linux/crash_dump.h>
#include <linux/pid_namespace.h>
+#include <linux/compile.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>
#include <asm/io.h>
@@ -253,8 +254,15 @@ static int version_read_proc(char *page, char **start, off_t off,
{
int len;
- len = sprintf(page, linux_banner,
- utsname()->release, utsname()->version);
+ /* FIXED STRING! Don't touch! */
+ len = snprintf(page, PAGE_SIZE,
+ "%s version %s"
+ " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ")"
+ " (" LINUX_COMPILER ")"
+ " %s\n",
+ utsname()->sysname,
+ utsname()->release,
+ utsname()->version);
return proc_calc_metrics(page, start, off, count, eof, len);
}
diff --git a/fs/select.c b/fs/select.c
index dcbc1112b7e..fe0893afd93 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -311,7 +311,7 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
{
fd_set_bits fds;
void *bits;
- int ret, max_fdset;
+ int ret, max_fds;
unsigned int size;
struct fdtable *fdt;
/* Allocate small arguments on the stack to save memory and be faster */
@@ -321,13 +321,13 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
if (n < 0)
goto out_nofds;
- /* max_fdset can increase, so grab it once to avoid race */
+ /* max_fds can increase, so grab it once to avoid race */
rcu_read_lock();
fdt = files_fdtable(current->files);
- max_fdset = fdt->max_fdset;
+ max_fds = fdt->max_fds;
rcu_read_unlock();
- if (n > max_fdset)
- n = max_fdset;
+ if (n > max_fds)
+ n = max_fds;
/*
* We need 6 bitmaps (in/out/ex for both incoming and outgoing),
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8e6b56fc1ca..b56eb754e2d 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1406,7 +1406,7 @@ xfs_vm_direct_IO(
xfs_end_io_direct);
}
- if (unlikely(ret <= 0 && iocb->private))
+ if (unlikely(ret != -EIOCBQUEUED && iocb->private))
xfs_destroy_ioend(iocb->private);
return ret;
}