From 6d6cb0d688d0f262cb4fd5771648b0ac01d4f82c Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 8 Apr 2009 14:07:57 +0200 Subject: UBIFS: reset no_space flag after inode deletion When UBIFS runs out of space it spends a lot of time trying to find more space before returning ENOSPC. As there is no point repeating that unless something has changed, UBIFS has an optimization to record that the file system is 100% full and not try to find space. That flag was not being reset when a pending deletion was finally done. Signed-off-by: Adrian Hunter Reviewed-by: Artem Bityutskiy --- fs/ubifs/budget.c | 3 ++- fs/ubifs/super.c | 5 +++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index af1914462f0..d0231ba783d 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -628,7 +628,7 @@ void ubifs_convert_page_budget(struct ubifs_info *c) * * This function releases budget corresponding to a dirty inode. It is usually * called when after the inode has been written to the media and marked as - * clean. + * clean. It also causes the "no space" flags to be cleared. */ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_inode *ui) @@ -636,6 +636,7 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, struct ubifs_budget_req req; memset(&req, 0, sizeof(struct ubifs_budget_req)); + /* The "no space" flags will be cleared because dd_growth is > 0 */ req.dd_growth = c->inode_budget + ALIGN(ui->data_len, 8); ubifs_release_budget(c, &req); } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index faa44f90608..f2c1c0b79f6 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -360,6 +360,11 @@ static void ubifs_delete_inode(struct inode *inode) out: if (ui->dirty) ubifs_release_dirty_inode_budget(c, ui); + else { + /* We've deleted something - clean the "no space" flags */ + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } clear_inode(inode); } -- cgit v1.2.3-70-g09d2 From 8b3884a841f398f6e0a0411d6929d8d9381bb265 Mon Sep 17 00:00:00 2001 From: Hunter Adrian Date: Thu, 14 May 2009 06:32:30 +0200 Subject: UBIFS: return error if link and unlink race Consider a scenario when 'vfs_link(dirA/fileA)' and 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not lock 'dirA->i_mutex', so this is possible. Both of the functions lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' to the list of orphans. After this, 'vfs_link()' will link 'dirB/fileB' to 'inodeA'. Thir is a problem because, for example, the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode to the list of orphans. This problem was reported by J. R. Okajima [Artem: add more comments, amended commit message] Signed-off-by: Adrian Hunter Signed-off-by: Artem Bityutskiy --- fs/ubifs/dir.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index f55d523c52b..552fb0111ff 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -528,6 +528,25 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, inode->i_nlink, dir->i_ino); ubifs_assert(mutex_is_locked(&dir->i_mutex)); ubifs_assert(mutex_is_locked(&inode->i_mutex)); + + /* + * Return -ENOENT if we've raced with unlink and i_nlink is 0. Doing + * otherwise has the potential to corrupt the orphan inode list. + * + * Indeed, consider a scenario when 'vfs_link(dirA/fileA)' and + * 'vfs_unlink(dirA/fileA, dirB/fileB)' race. 'vfs_link()' does not + * lock 'dirA->i_mutex', so this is possible. Both of the functions + * lock 'fileA->i_mutex' though. Suppose 'vfs_unlink()' wins, and takes + * 'fileA->i_mutex' mutex first. Suppose 'fileA->i_nlink' is 1. In this + * case 'ubifs_unlink()' will drop the last reference, and put 'inodeA' + * to the list of orphans. After this, 'vfs_link()' will link + * 'dirB/fileB' to 'inodeA'. This is a problem because, for example, + * the subsequent 'vfs_unlink(dirB/fileB)' will add the same inode + * to the list of orphans. + */ + if (inode->i_nlink == 0) + return -ENOENT; + err = dbg_check_synced_i_size(inode); if (err) return err; -- cgit v1.2.3-70-g09d2 From 8eec2f36fb869f1e6d81d834bbbd487941222fc8 Mon Sep 17 00:00:00 2001 From: Corentin Chary Date: Mon, 25 May 2009 08:49:10 +0200 Subject: UBIFS: return proper error code if the compr is not present If the compressor is not present, mount_ubifs need to return an error code. This way ubifs_fill_super will stop and handle the error. Signed-off-by: Corentin Chary Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f2c1c0b79f6..052514ca279 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1187,6 +1187,7 @@ static int mount_ubifs(struct ubifs_info *c) if (!ubifs_compr_present(c->default_compr)) { ubifs_err("'compressor \"%s\" is not compiled in", ubifs_compr_name(c->default_compr)); + err = -ENOTSUPP; goto out_free; } -- cgit v1.2.3-70-g09d2 From 7c83f5cb551b2e5c4934933fda006636f7424123 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 May 2009 19:23:04 +0300 Subject: UBIFS: use anonymous device UBIFS has erroneuosly set 'sb->s_dev' to the UBI volume character device major/minor. This may lead to clashes if there is another FS mounted to a block device with the same major/minor numbers. User-space programs which use 'stat->st_dev' may get confused because of this. This problem was found by Al Viro. He also pointed the way to fix the problem - use 'set_anon_super()' and 'kill_anon_super()' VFS helpers. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 052514ca279..42b818daa16 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1945,7 +1945,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; sb->s_blocksize_bits = UBIFS_BLOCK_SHIFT; - sb->s_dev = c->vi.cdev; sb->s_maxbytes = c->max_inode_sz = key_max_inode_size(c); if (c->max_inode_sz > MAX_LFS_FILESIZE) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; @@ -1990,16 +1989,9 @@ out_free: static int sb_test(struct super_block *sb, void *data) { dev_t *dev = data; + struct ubifs_info *c = sb->s_fs_info; - return sb->s_dev == *dev; -} - -static int sb_set(struct super_block *sb, void *data) -{ - dev_t *dev = data; - - sb->s_dev = *dev; - return 0; + return c->vi.cdev == *dev; } static int ubifs_get_sb(struct file_system_type *fs_type, int flags, @@ -2027,7 +2019,7 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags, dbg_gen("opened ubi%d_%d", vi.ubi_num, vi.vol_id); - sb = sget(fs_type, &sb_test, &sb_set, &vi.cdev); + sb = sget(fs_type, &sb_test, &set_anon_super, &vi.cdev); if (IS_ERR(sb)) { err = PTR_ERR(sb); goto out_close; @@ -2068,16 +2060,11 @@ out_close: return err; } -static void ubifs_kill_sb(struct super_block *sb) -{ - generic_shutdown_super(sb); -} - static struct file_system_type ubifs_fs_type = { .name = "ubifs", .owner = THIS_MODULE, .get_sb = ubifs_get_sb, - .kill_sb = ubifs_kill_sb + .kill_sb = kill_anon_super, }; /* -- cgit v1.2.3-70-g09d2 From 428ff9d2e37d3a82af0f56b476f70c244cf550d1 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 May 2009 16:59:28 +0300 Subject: UBIFS: remove dead code UBIFS assumes that @c->min_io_size is 8 in case of NOR flash. This is because UBIFS alignes all nodes to 8-byte boundary, and maintaining @c->min_io_size introduced unnecessary complications. This patch removes senseless constructs like: if (c->min_io_size == 1) NOR-specific code Also, few commentaries amendments. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 1 - fs/ubifs/recovery.c | 31 ++++--------------------------- 2 files changed, 4 insertions(+), 28 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index d0231ba783d..eaf6d891d46 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -91,7 +91,6 @@ static int shrink_liability(struct ubifs_info *c, int nr_to_write) return nr_written; } - /** * run_gc - run garbage collector. * @c: UBIFS file-system description object diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c index 10662975d2e..805605250f1 100644 --- a/fs/ubifs/recovery.c +++ b/fs/ubifs/recovery.c @@ -343,33 +343,15 @@ int ubifs_write_rcvrd_mst_node(struct ubifs_info *c) * * This function returns %1 if @offs was in the last write to the LEB whose data * is in @buf, otherwise %0 is returned. The determination is made by checking - * for subsequent empty space starting from the next min_io_size boundary (or a - * bit less than the common header size if min_io_size is one). + * for subsequent empty space starting from the next @c->min_io_size boundary. */ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) { - int empty_offs; - int check_len; + int empty_offs, check_len; uint8_t *p; - if (c->min_io_size == 1) { - check_len = c->leb_size - offs; - p = buf + check_len; - for (; check_len > 0; check_len--) - if (*--p != 0xff) - break; - /* - * 'check_len' is the size of the corruption which cannot be - * more than the size of 1 node if it was caused by an unclean - * unmount. - */ - if (check_len > UBIFS_MAX_NODE_SZ) - return 0; - return 1; - } - /* - * Round up to the next c->min_io_size boundary i.e. 'offs' is in the + * Round up to the next @c->min_io_size boundary i.e. @offs is in the * last wbuf written. After that should be empty space. */ empty_offs = ALIGN(offs + 1, c->min_io_size); @@ -392,7 +374,7 @@ static int is_last_write(const struct ubifs_info *c, void *buf, int offs) * * This function pads up to the next min_io_size boundary (if there is one) and * sets empty space to all 0xff. @buf, @offs and @len are updated to the next - * min_io_size boundary (if there is one). + * @c->min_io_size boundary. */ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, int *offs, int *len) @@ -402,11 +384,6 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum, lnum = lnum; dbg_rcvry("cleaning corruption at %d:%d", lnum, *offs); - if (c->min_io_size == 1) { - memset(*buf, 0xff, c->leb_size - *offs); - return; - } - ubifs_assert(!(*offs & 7)); empty_offs = ALIGN(*offs, c->min_io_size); pad_len = empty_offs - *offs; -- cgit v1.2.3-70-g09d2 From 8379ea31e991ed2098660954d25f64386adee65c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 29 May 2009 12:34:52 +0300 Subject: UBIFS: allow sync option in rootflags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When passing UBIFS parameters via kernel command line, the sync option will be passed to UBIFS as a string, not as an MS_SYNCHRONOUS flag. Teach UBIFS interpreting this flag. Reported-by: Aurélien GÉRÔME Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 40 +++++++++++++++++++++++++++++++++++----- 1 file changed, 35 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 42b818daa16..d10fc88c7bb 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -939,6 +939,27 @@ static const match_table_t tokens = { {Opt_err, NULL}, }; +/** + * parse_standard_option - parse a standard mount option. + * @option: the option to parse + * + * Normally, standard mount options like "sync" are passed to file-systems as + * flags. However, when a "rootflags=" kernel boot parameter is used, they may + * be present in the options string. This function tries to deal with this + * situation and parse standard options. Returns 0 if the option was not + * recognized, and the corresponding integer flag if it was. + * + * UBIFS is only interested in the "sync" option, so do not check for anything + * else. + */ +static int parse_standard_option(const char *option) +{ + ubifs_msg("parse %s", option); + if (!strcmp(option, "sync")) + return MS_SYNCHRONOUS; + return 0; +} + /** * ubifs_parse_options - parse mount parameters. * @c: UBIFS file-system description object @@ -1015,9 +1036,19 @@ static int ubifs_parse_options(struct ubifs_info *c, char *options, break; } default: - ubifs_err("unrecognized mount option \"%s\" " - "or missing value", p); - return -EINVAL; + { + unsigned long flag; + struct super_block *sb = c->vfs_sb; + + flag = parse_standard_option(p); + if (!flag) { + ubifs_err("unrecognized mount option \"%s\" " + "or missing value", p); + return -EINVAL; + } + sb->s_flags |= flag; + break; + } } } @@ -1908,6 +1939,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&c->orph_list); INIT_LIST_HEAD(&c->orph_new); + c->vfs_sb = sb; c->highest_inum = UBIFS_FIRST_INO; c->lhead_lnum = c->ltail_lnum = UBIFS_LOG_LNUM; @@ -1939,8 +1971,6 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto out_bdi; - c->vfs_sb = sb; - sb->s_fs_info = c; sb->s_magic = UBIFS_SUPER_MAGIC; sb->s_blocksize = UBIFS_BLOCK_SIZE; -- cgit v1.2.3-70-g09d2 From 3f36406f26437afae9f43cc6dcfc264143e21ed0 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 29 May 2009 20:16:27 +0300 Subject: UBIFS: do not forget to register BDI device Reviewed-by: Jens Axboe Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index d10fc88c7bb..b9b051a4c01 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -1966,6 +1966,9 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) err = bdi_init(&c->bdi); if (err) goto out_close; + err = bdi_register(&c->bdi, NULL, "ubifs"); + if (err) + goto out_bdi; err = ubifs_parse_options(c, data, 0); if (err) -- cgit v1.2.3-70-g09d2 From f2c5dbd7b7396457efc114f825acfdd4db4608f8 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Thu, 28 May 2009 16:24:15 +0300 Subject: UBIFS: start using hrtimers UBIFS uses timers for write-buffer write-back. It is not crucial for us to write-back exactly on time. We are fine to write-back a little earlier or later. And this means we may optimize UBIFS timer so that it could be groped with a close timer event, so that the CPU would not be waken up just to do the write back. This is optimization to lessen power consumption, which is important in embedded devices UBIFS is used for. hrtimers have a nice feature: they are effectively range timers, and we may defind the soft and hard limits for it. Standard timers do not have these feature. They may only be made deferrable, but this means there is effectively no hard limit. So, we will better use hrtimers. Signed-off-by: Artem Bityutskiy --- fs/ubifs/io.c | 34 +++++++++++++++++++++------------- fs/ubifs/super.c | 6 +++--- fs/ubifs/ubifs.h | 13 ++++++++----- 3 files changed, 32 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index e8e632a1dcd..bc5857199ec 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -293,13 +293,14 @@ void ubifs_prep_grp_node(struct ubifs_info *c, void *node, int len, int last) * * This function is called when the write-buffer timer expires. */ -static void wbuf_timer_callback_nolock(unsigned long data) +static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) { - struct ubifs_wbuf *wbuf = (struct ubifs_wbuf *)data; + struct ubifs_wbuf *wbuf = container_of(timer, struct ubifs_wbuf, timer); wbuf->need_sync = 1; wbuf->c->need_wbuf_sync = 1; ubifs_wake_up_bgt(wbuf->c); + return HRTIMER_NORESTART; } /** @@ -308,13 +309,12 @@ static void wbuf_timer_callback_nolock(unsigned long data) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { - ubifs_assert(!timer_pending(&wbuf->timer)); + ubifs_assert(!hrtimer_active(&wbuf->timer)); - if (!wbuf->timeout) + if (!ktime_to_ns(wbuf->softlimit)) return; - - wbuf->timer.expires = jiffies + wbuf->timeout; - add_timer(&wbuf->timer); + hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + HRTIMER_MODE_REL); } /** @@ -329,7 +329,7 @@ static void cancel_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) * should be canceled. */ wbuf->need_sync = 0; - del_timer(&wbuf->timer); + hrtimer_cancel(&wbuf->timer); } /** @@ -825,6 +825,7 @@ out: int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) { size_t size; + ktime_t hardlimit; wbuf->buf = kmalloc(c->min_io_size, GFP_KERNEL); if (!wbuf->buf) @@ -845,14 +846,21 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->sync_callback = NULL; mutex_init(&wbuf->io_mutex); spin_lock_init(&wbuf->lock); - wbuf->c = c; - init_timer(&wbuf->timer); - wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->timer.data = (unsigned long)wbuf; - wbuf->timeout = DEFAULT_WBUF_TIMEOUT; wbuf->next_ino = 0; + hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + wbuf->timer.function = wbuf_timer_callback_nolock; + /* + * Make write-buffer soft limit to be 20% of the hard limit. The + * write-buffer timer is allowed to expire any time between the soft + * and hard limits. + */ + hardlimit = ktime_set(DEFAULT_WBUF_TIMEOUT_SECS, 0); + wbuf->delta = (DEFAULT_WBUF_TIMEOUT_SECS * NSEC_PER_SEC) * 2 / 10; + wbuf->softlimit = ktime_sub_ns(hardlimit, wbuf->delta); + hrtimer_set_expires_range_ns(&wbuf->timer, wbuf->softlimit, + wbuf->delta); return 0; } diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index b9b051a4c01..91c91cb7a59 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -799,7 +799,7 @@ static int alloc_wbufs(struct ubifs_info *c) * does not need to be synchronized by timer. */ c->jheads[GCHD].wbuf.dtype = UBI_LONGTERM; - c->jheads[GCHD].wbuf.timeout = 0; + c->jheads[GCHD].wbuf.softlimit = ktime_set(0, 0); return 0; } @@ -1695,7 +1695,7 @@ static void ubifs_remount_ro(struct ubifs_info *c) for (i = 0; i < c->jhead_cnt; i++) { ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); + hrtimer_cancel(&c->jheads[i].wbuf.timer); } c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_DIRTY); @@ -1755,7 +1755,7 @@ static void ubifs_put_super(struct super_block *sb) if (c->jheads) for (i = 0; i < c->jhead_cnt; i++) { ubifs_wbuf_sync(&c->jheads[i].wbuf); - del_timer_sync(&c->jheads[i].wbuf.timer); + hrtimer_cancel(&c->jheads[i].wbuf.timer); } /* diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 0a8341e1408..1bf01d82006 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -95,8 +95,8 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Default write-buffer synchronization timeout (5 secs) */ -#define DEFAULT_WBUF_TIMEOUT (5 * HZ) +/* Default write-buffer synchronization timeout in seconds */ +#define DEFAULT_WBUF_TIMEOUT_SECS 5 /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -650,8 +650,10 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields + * @softlimit: soft write-buffer timeout interval + * @delta: hard and soft timeouts delta (the timer expire inteval is @softlimit + * and @softlimit + @delta) * @timer: write-buffer timer - * @timeout: timer expire interval in jiffies * @need_sync: it is set if its timer expired and needs sync * @next_ino: points to the next position of the following inode number * @inodes: stores the inode numbers of the nodes which are in wbuf @@ -678,8 +680,9 @@ struct ubifs_wbuf { int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - struct timer_list timer; - int timeout; + ktime_t softlimit; + unsigned long long delta; + struct hrtimer timer; int need_sync; int next_ino; ino_t *inodes; -- cgit v1.2.3-70-g09d2 From df59c0ad05182329688e514e5a9c3836fa208ea3 Mon Sep 17 00:00:00 2001 From: Alexey Zaytsev Date: Wed, 10 Jun 2009 12:56:56 -0700 Subject: [SCSI] compat: don't perform unneeded copy in sg_io code The members from 'status' in struct sg_io_hdr to the last are used to transfer information from kernel to user space. The values that user space sets are just ignored. Signed-off-by: Alexey Zaytsev Acked-by: Jens Axboe Acked-by: FUJITA Tomonori Signed-off-by: Andrew Morton Signed-off-by: James Bottomley --- fs/compat_ioctl.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index b83f6bcfa51..905523cc281 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -788,12 +788,6 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, unsigned long arg) if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - if (copy_in_user(&sgio->status, &sgio32->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); if (err >= 0) { -- cgit v1.2.3-70-g09d2 From 557411eb2ce61ef5e87bd759a6f86881586df857 Mon Sep 17 00:00:00 2001 From: Armin Kuster Date: Wed, 29 Apr 2009 07:29:59 -1000 Subject: Sysfs: fix possible memleak in sysfs_follow_link There is the possiblity of a memory leak if a page is allocated and if sysfs_getlink() fails in the sysfs_follow_link. Signed-off-by: Armin Kuster Signed-off-by: Greg Kroah-Hartman --- fs/sysfs/symlink.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/sysfs/symlink.c b/fs/sysfs/symlink.c index a3ba217fbe7..1d897ad808e 100644 --- a/fs/sysfs/symlink.c +++ b/fs/sysfs/symlink.c @@ -192,8 +192,11 @@ static void *sysfs_follow_link(struct dentry *dentry, struct nameidata *nd) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) + if (page) { error = sysfs_getlink(dentry, (char *) page); + if (error < 0) + free_page((unsigned long)page); + } nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); return NULL; } -- cgit v1.2.3-70-g09d2 From 56a83cc92991ed5bf76e224dd2ad53b5e9c00681 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 25 Apr 2009 00:39:40 -0400 Subject: debugfs: dont stop on first failed recursive delete debugfs: dont stop on first failed recursive delete While running a while loop of removing a module that removes a debugfs directory with debugfs_remove_recursive, and at the same time doing a while loop of cat of a file in that directory, I would hit a point where somehow the cat of the file caused the remove to fail. The result is that other files did not get removed when the module was removed. I simple read of one of those file can oops the kernel because the operations to the file no longer exist (removed by module). The funny thing is that the file being cat'ed was removed. It was the siblings that were not. I see in the code to debugfs_remove_recursive there's a test that checks if the child fails to bail out of the loop to prevent an infinite loop. What this patch does is to still try any siblings in that directory. If all the siblings fail, or there are no more siblings, then we exit the loop. This fixes the above symptom, but... This is no full proof. It makes the debugfs_remove_recursive a bit more robust, but it does not explain why the one file failed. There may be some kind of delay deletion that makes the debugfs think it did not succeed. So this patch is more of a fix for the symptom but not the disease. This patch still makes the debugfs_remove_recursive more robust and until I can find out why the bug exists, this patch will keep the kernel from oopsing in most cases. Even after the cause is found I think this change can stand on its own and should be kept. [ Impact: prevent kernel oops on module unload and reading debugfs files ] Signed-off-by: Steven Rostedt Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/inode.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 0662ba6de85..d22438ef767 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -403,6 +403,7 @@ void debugfs_remove_recursive(struct dentry *dentry) } child = list_entry(parent->d_subdirs.next, struct dentry, d_u.d_child); + next_sibling: /* * If "child" isn't empty, walk down the tree and @@ -416,6 +417,16 @@ void debugfs_remove_recursive(struct dentry *dentry) } __debugfs_remove(child, parent); if (parent->d_subdirs.next == &child->d_u.d_child) { + /* + * Try the next sibling. + */ + if (child->d_u.d_child.next != &parent->d_subdirs) { + child = list_entry(child->d_u.d_child.next, + struct dentry, + d_u.d_child); + goto next_sibling; + } + /* * Avoid infinite loop if we fail to remove * one dentry. -- cgit v1.2.3-70-g09d2 From 400ced61fa4914457d7e0a38e7c0fc6fd208694b Mon Sep 17 00:00:00 2001 From: Jonathan Corbet Date: Mon, 25 May 2009 10:15:27 -0600 Subject: debugfs: fix docbook error Fix an error in debugfs_create_blob's docbook description It cannot actually be used to write a binary blob. Signed-off-by: Jonathan Corbet --- fs/debugfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 33a90120f6a..39a619c222f 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -419,7 +419,7 @@ static const struct file_operations fops_blob = { }; /** - * debugfs_create_blob - create a debugfs file that is used to read and write a binary blob + * debugfs_create_blob - create a debugfs file that is used to read a binary blob * @name: a pointer to a string containing the name of the file to create. * @mode: the permission that the file should have * @parent: a pointer to the parent dentry for this file. This should be a -- cgit v1.2.3-70-g09d2 From e4792aa30f9d33584d7192685ed149cc5fee737f Mon Sep 17 00:00:00 2001 From: Robin Getz Date: Tue, 2 Jun 2009 03:00:47 -0400 Subject: debugfs: use specified mode to possibly mark files read/write only In many SoC implementations there are hardware registers can be read or write only. This extends the debugfs to enforce the file permissions for these types of registers by providing a set of fops which are read or write only. This assumes that the kernel developer knows more about the hardware than the user (even root users) -- which is normally true. Signed-off-by: Robin Getz Signed-off-by: Mike Frysinger Signed-off-by: Bryan Wu Signed-off-by: Greg Kroah-Hartman --- fs/debugfs/file.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) (limited to 'fs') diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 39a619c222f..4d74fc72c19 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -67,6 +67,8 @@ static int debugfs_u8_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_ro, debugfs_u8_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u8_wo, NULL, debugfs_u8_set, "%llu\n"); /** * debugfs_create_u8 - create a debugfs file that is used to read and write an unsigned 8-bit value @@ -95,6 +97,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u8, debugfs_u8_get, debugfs_u8_set, "%llu\n"); struct dentry *debugfs_create_u8(const char *name, mode_t mode, struct dentry *parent, u8 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u8_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u8); } EXPORT_SYMBOL_GPL(debugfs_create_u8); @@ -110,6 +119,8 @@ static int debugfs_u16_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_ro, debugfs_u16_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u16_wo, NULL, debugfs_u16_set, "%llu\n"); /** * debugfs_create_u16 - create a debugfs file that is used to read and write an unsigned 16-bit value @@ -138,6 +149,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u16, debugfs_u16_get, debugfs_u16_set, "%llu\n"); struct dentry *debugfs_create_u16(const char *name, mode_t mode, struct dentry *parent, u16 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u16_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u16); } EXPORT_SYMBOL_GPL(debugfs_create_u16); @@ -153,6 +171,8 @@ static int debugfs_u32_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_ro, debugfs_u32_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u32_wo, NULL, debugfs_u32_set, "%llu\n"); /** * debugfs_create_u32 - create a debugfs file that is used to read and write an unsigned 32-bit value @@ -181,6 +201,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u32, debugfs_u32_get, debugfs_u32_set, "%llu\n"); struct dentry *debugfs_create_u32(const char *name, mode_t mode, struct dentry *parent, u32 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u32_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u32); } EXPORT_SYMBOL_GPL(debugfs_create_u32); @@ -197,6 +224,8 @@ static int debugfs_u64_get(void *data, u64 *val) return 0; } DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_ro, debugfs_u64_get, NULL, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_u64_wo, NULL, debugfs_u64_set, "%llu\n"); /** * debugfs_create_u64 - create a debugfs file that is used to read and write an unsigned 64-bit value @@ -225,15 +254,28 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_u64, debugfs_u64_get, debugfs_u64_set, "%llu\n"); struct dentry *debugfs_create_u64(const char *name, mode_t mode, struct dentry *parent, u64 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_u64_wo); + return debugfs_create_file(name, mode, parent, value, &fops_u64); } EXPORT_SYMBOL_GPL(debugfs_create_u64); DEFINE_SIMPLE_ATTRIBUTE(fops_x8, debugfs_u8_get, debugfs_u8_set, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_ro, debugfs_u8_get, NULL, "0x%02llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x8_wo, NULL, debugfs_u8_set, "0x%02llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x16, debugfs_u16_get, debugfs_u16_set, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_ro, debugfs_u16_get, NULL, "0x%04llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x16_wo, NULL, debugfs_u16_set, "0x%04llx\n"); DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_ro, debugfs_u32_get, NULL, "0x%08llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_x32_wo, NULL, debugfs_u32_set, "0x%08llx\n"); /* * debugfs_create_x{8,16,32} - create a debugfs file that is used to read and write an unsigned {8,16,32}-bit value @@ -256,6 +298,13 @@ DEFINE_SIMPLE_ATTRIBUTE(fops_x32, debugfs_u32_get, debugfs_u32_set, "0x%08llx\n" struct dentry *debugfs_create_x8(const char *name, mode_t mode, struct dentry *parent, u8 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x8_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x8); } EXPORT_SYMBOL_GPL(debugfs_create_x8); @@ -273,6 +322,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x8); struct dentry *debugfs_create_x16(const char *name, mode_t mode, struct dentry *parent, u16 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x16_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x16); } EXPORT_SYMBOL_GPL(debugfs_create_x16); @@ -290,6 +346,13 @@ EXPORT_SYMBOL_GPL(debugfs_create_x16); struct dentry *debugfs_create_x32(const char *name, mode_t mode, struct dentry *parent, u32 *value) { + /* if there are no write bits set, make read only */ + if (!(mode & S_IWUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_ro); + /* if there are no read bits set, make write only */ + if (!(mode & S_IRUGO)) + return debugfs_create_file(name, mode, parent, value, &fops_x32_wo); + return debugfs_create_file(name, mode, parent, value, &fops_x32); } EXPORT_SYMBOL_GPL(debugfs_create_x32); -- cgit v1.2.3-70-g09d2 From e27ecdd94d81e5bc3d1f68591701db5adb342f0d Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Fri, 24 Apr 2009 10:11:40 +0200 Subject: nls: utf8_wcstombs: use correct buffer size in error case When utf8_wcstombs encounters a character that cannot be encoded, we must not decrease the remaining output buffer size because nothing has been written to the output buffer. Signed-off-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 9b0efdad891..000736d89c9 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -144,7 +144,6 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) size = utf8_wctomb(op, *ip, maxlen); if (size == -1) { /* Ignore character and move on */ - maxlen--; } else { op += size; maxlen -= size; -- cgit v1.2.3-70-g09d2 From 905c02acbd89f427c87a6d0a50fed757f6b3001c Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Fri, 24 Apr 2009 10:11:56 +0200 Subject: nls: utf8_wcstombs: fix buffer overflow utf8_wcstombs forgot to include one-byte UTF-8 characters when calculating the output buffer size, i.e., theoretically, it was possible to overflow the output buffer with an input string that contains enough ASCII characters. In practice, this was no problem because the only user so far (VFAT) always uses a big enough output buffer. Signed-off-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- fs/nls/nls_base.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 000736d89c9..750abf211e2 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -150,6 +150,7 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) } } else { *op++ = (__u8) *ip; + maxlen--; } ip++; } -- cgit v1.2.3-70-g09d2 From 74675a58507e769beee7d949dbed788af3c4139d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 30 Apr 2009 10:08:18 -0400 Subject: NLS: update handling of Unicode This patch (as1239) updates the kernel's treatment of Unicode. The character-set conversion routines are well behind the current state of the Unicode specification: They don't recognize the existence of code points beyond plane 0 or of surrogate pairs in the UTF-16 encoding. The old wchar_t 16-bit type is retained because it's still used in lots of places. This shouldn't cause any new problems; if a conversion now results in an invalid 16-bit code then before it must have yielded an undefined code. Difficult-to-read names like "utf_mbstowcs" are replaced with more transparent names like "utf8s_to_utf16s" and the ordering of the parameters is rationalized (buffer lengths come immediate after the pointers they refer to, and the inputs precede the outputs). Fortunately the low-level conversion routines are used in only a few places; the interfaces to the higher-level uni2char and char2uni methods have been left unchanged. Signed-off-by: Alan Stern Acked-by: Clemens Ladisch Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/message.c | 10 +-- fs/befs/linuxvfs.c | 20 +++--- fs/fat/dir.c | 29 ++++---- fs/fat/namei_vfat.c | 4 +- fs/isofs/joliet.c | 36 +--------- fs/ncpfs/ncplib_kernel.c | 8 ++- fs/nls/nls_base.c | 164 +++++++++++++++++++++++++++++---------------- fs/nls/nls_utf8.c | 13 +++- include/linux/nls.h | 35 ++++++++-- 9 files changed, 182 insertions(+), 137 deletions(-) (limited to 'fs') diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c index e98f928c08e..9bd26dec759 100644 --- a/drivers/usb/core/message.c +++ b/drivers/usb/core/message.c @@ -780,14 +780,13 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) { unsigned char *tbuf; int err; - unsigned int u; if (dev->state == USB_STATE_SUSPENDED) return -EHOSTUNREACH; if (size <= 0 || !buf || !index) return -EINVAL; buf[0] = 0; - tbuf = kmalloc(256 + 2, GFP_NOIO); + tbuf = kmalloc(256, GFP_NOIO); if (!tbuf) return -ENOMEM; @@ -814,12 +813,9 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size) if (err < 0) goto errout; - for (u = 2; u < err; u += 2) - le16_to_cpus((u16 *)&tbuf[u]); - tbuf[u] = 0; - tbuf[u + 1] = 0; size--; /* leave room for trailing NULL char in output buffer */ - err = utf8_wcstombs(buf, (u16 *)&tbuf[2], size); + err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2, + UTF16_LITTLE_ENDIAN, buf, size); buf[err] = 0; if (tbuf[1] != USB_DT_STRING) diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9367b6297d8..89cd2deeb4a 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in, { struct nls_table *nls = BEFS_SB(sb)->nls; int i, o; - wchar_t uni; + unicode_t uni; int unilen, utflen; char *result; /* The utf8->nls conversion won't make the final nls string bigger @@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in, for (i = o = 0; i < in_len; i += utflen, o += unilen) { /* convert from UTF-8 to Unicode */ - utflen = utf8_mbtowc(&uni, &in[i], in_len - i); - if (utflen < 0) { + utflen = utf8_to_utf32(&in[i], in_len - i, &uni); + if (utflen < 0) goto conv_err; - } /* convert from Unicode to nls */ + if (uni > MAX_WCHAR_T) + goto conv_err; unilen = nls->uni2char(uni, &result[o], in_len - o); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } } result[o] = '\0'; *out_len = o; @@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in, /* convert from nls to unicode */ unilen = nls->char2uni(&in[i], in_len - i, &uni); - if (unilen < 0) { + if (unilen < 0) goto conv_err; - } /* convert from unicode to UTF-8 */ - utflen = utf8_wctomb(&result[o], uni, 3); - if (utflen <= 0) { + utflen = utf32_to_utf8(uni, &result[o], 3); + if (utflen <= 0) goto conv_err; - } } result[o] = '\0'; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index f3500294eec..7c14c8cbbab 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -22,6 +22,19 @@ #include #include "fat.h" +/* + * Maximum buffer size of short name. + * [(MSDOS_NAME + '.') * max one char + nul] + * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] + */ +#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) +/* + * Maximum buffer size of unicode chars from slots. + * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] + */ +#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) +#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) + static inline loff_t fat_make_i_pos(struct super_block *sb, struct buffer_head *bh, struct msdos_dir_entry *de) @@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni, unsigned char *buf, int size) { if (sbi->options.utf8) - return utf8_wcstombs(buf, uni, size); + return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS, + UTF16_HOST_ENDIAN, buf, size); else return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate, sbi->nls_io); @@ -324,19 +338,6 @@ parse_long: return 0; } -/* - * Maximum buffer size of short name. - * [(MSDOS_NAME + '.') * max one char + nul] - * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul] - */ -#define FAT_MAX_SHORT_SIZE ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1) -/* - * Maximum buffer size of unicode chars from slots. - * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)] - */ -#define FAT_MAX_UNI_CHARS ((MSDOS_SLOTS - 1) * 13 + 1) -#define FAT_MAX_UNI_SIZE (FAT_MAX_UNI_CHARS * sizeof(wchar_t)) - /* * Return values: negative -> error, 0 -> not found, positive -> found, * value is the total amount of slots, including the shortname entry. diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index b50ecbe97f8..f92ad999535 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, if (utf8) { int name_len = strlen(name); - *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); + *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname); /* * We stripped '.'s before and set len appropriately, - * but utf8_mbstowcs doesn't care about len + * but utf8s_to_utf16s doesn't care about len */ *outlen -= (name_len - len); diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c index 92c14b850e9..a048de81c09 100644 --- a/fs/isofs/joliet.c +++ b/fs/isofs/joliet.c @@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls) return (op - ascii); } -/* Convert big endian wide character string to utf8 */ -static int -wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen) -{ - const __u8 *ip; - __u8 *op; - int size; - __u16 c; - - op = s; - ip = pwcs; - while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) { - c = (*ip << 8) | ip[1]; - if (c > 0x7f) { - size = utf8_wctomb(op, c, maxlen); - if (size == -1) { - /* Ignore character and move on */ - maxlen--; - } else { - op += size; - maxlen -= size; - } - } else { - *op++ = (__u8) c; - } - ip += 2; - inlen--; - } - return (op - s); -} - int get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode) { @@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset; if (utf8) { - len = wcsntombs_be(outname, de->name, - de->name_len[0] >> 1, PAGE_SIZE); + len = utf16s_to_utf8s((const wchar_t *) de->name, + de->name_len[0] >> 1, UTF16_BIG_ENDIAN, + outname, PAGE_SIZE); } else { len = uni16_to_x8(outname, (__be16 *) de->name, de->name_len[0] >> 1, nls); diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index 97645f11211..0ec6237a597 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; + unicode_t u; - k = utf8_mbtowc(&ec, iname, iname_end - iname); - if (k < 0) + k = utf8_to_utf32(iname, iname_end - iname, &u); + if (k < 0 || u > MAX_WCHAR_T) return -EINVAL; iname += k; + ec = u; } else { if (*iname == NCP_ESC) { int k; @@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen, if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) { int k; - k = utf8_wctomb(iname, ec, iname_end - iname); + k = utf32_to_utf8(ec, iname, iname_end - iname); if (k < 0) { err = -ENAMETOOLONG; goto quit; diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 750abf211e2..477d37d83b3 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -15,6 +15,7 @@ #include #include #include +#include static struct nls_table default_table; static struct nls_table *tables = &default_table; @@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] = {0, /* end of table */} }; -int -utf8_mbtowc(wchar_t *p, const __u8 *s, int n) +#define UNICODE_MAX 0x0010ffff +#define PLANE_SIZE 0x00010000 + +#define SURROGATE_MASK 0xfffff800 +#define SURROGATE_PAIR 0x0000d800 +#define SURROGATE_LOW 0x00000400 +#define SURROGATE_BITS 0x000003ff + +int utf8_to_utf32(const u8 *s, int len, unicode_t *pu) { - long l; + unsigned long l; int c0, c, nc; const struct utf8_table *t; @@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) nc++; if ((c0 & t->cmask) == t->cval) { l &= t->lmask; - if (l < t->lval) + if (l < t->lval || l > UNICODE_MAX || + (l & SURROGATE_MASK) == SURROGATE_PAIR) return -1; - *p = l; + *pu = (unicode_t) l; return nc; } - if (n <= nc) + if (len <= nc) return -1; s++; c = (*s ^ 0x80) & 0xFF; @@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n) } return -1; } +EXPORT_SYMBOL(utf8_to_utf32); -int -utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n) +int utf32_to_utf8(unicode_t u, u8 *s, int maxlen) { - __u16 *op; - const __u8 *ip; - int size; - - op = pwcs; - ip = s; - while (*ip && n > 0) { - if (*ip & 0x80) { - size = utf8_mbtowc(op, ip, n); - if (size == -1) { - /* Ignore character and move on */ - ip++; - n--; - } else { - op++; - ip += size; - n -= size; - } - } else { - *op++ = *ip++; - n--; - } - } - return (op - pwcs); -} - -int -utf8_wctomb(__u8 *s, wchar_t wc, int maxlen) -{ - long l; + unsigned long l; int c, nc; const struct utf8_table *t; - + if (!s) return 0; - - l = wc; + + l = u; + if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR) + return -1; + nc = 0; for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) { nc++; if (l <= t->lmask) { c = t->shift; - *s = t->cval | (l >> c); + *s = (u8) (t->cval | (l >> c)); while (c > 0) { c -= 6; s++; - *s = 0x80 | ((l >> c) & 0x3F); + *s = (u8) (0x80 | ((l >> c) & 0x3F)); } return nc; } } return -1; } +EXPORT_SYMBOL(utf32_to_utf8); -int -utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) +int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs) { - const __u16 *ip; - __u8 *op; + u16 *op; int size; + unicode_t u; + + op = pwcs; + while (*s && len > 0) { + if (*s & 0x80) { + size = utf8_to_utf32(s, len, &u); + if (size < 0) { + /* Ignore character and move on */ + size = 1; + } else if (u >= PLANE_SIZE) { + u -= PLANE_SIZE; + *op++ = (wchar_t) (SURROGATE_PAIR | + ((u >> 10) & SURROGATE_BITS)); + *op++ = (wchar_t) (SURROGATE_PAIR | + SURROGATE_LOW | + (u & SURROGATE_BITS)); + } else { + *op++ = (wchar_t) u; + } + s += size; + len -= size; + } else { + *op++ = *s++; + len--; + } + } + return op - pwcs; +} +EXPORT_SYMBOL(utf8s_to_utf16s); + +static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian) +{ + switch (endian) { + default: + return c; + case UTF16_LITTLE_ENDIAN: + return __le16_to_cpu(c); + case UTF16_BIG_ENDIAN: + return __be16_to_cpu(c); + } +} + +int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian, + u8 *s, int maxlen) +{ + u8 *op; + int size; + unsigned long u, v; op = s; - ip = pwcs; - while (*ip && maxlen > 0) { - if (*ip > 0x7f) { - size = utf8_wctomb(op, *ip, maxlen); + while (len > 0 && maxlen > 0) { + u = get_utf16(*pwcs, endian); + if (!u) + break; + pwcs++; + len--; + if (u > 0x7f) { + if ((u & SURROGATE_MASK) == SURROGATE_PAIR) { + if (u & SURROGATE_LOW) { + /* Ignore character and move on */ + continue; + } + if (len <= 0) + break; + v = get_utf16(*pwcs, endian); + if ((v & SURROGATE_MASK) != SURROGATE_PAIR || + !(v & SURROGATE_LOW)) { + /* Ignore character and move on */ + continue; + } + u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10) + + (v & SURROGATE_BITS); + pwcs++; + len--; + } + size = utf32_to_utf8(u, op, maxlen); if (size == -1) { /* Ignore character and move on */ } else { @@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen) maxlen -= size; } } else { - *op++ = (__u8) *ip; + *op++ = (u8) u; maxlen--; } - ip++; } - return (op - s); + return op - s; } +EXPORT_SYMBOL(utf16s_to_utf8s); int register_nls(struct nls_table * nls) { @@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls); EXPORT_SYMBOL(unload_nls); EXPORT_SYMBOL(load_nls); EXPORT_SYMBOL(load_nls_default); -EXPORT_SYMBOL(utf8_mbtowc); -EXPORT_SYMBOL(utf8_mbstowcs); -EXPORT_SYMBOL(utf8_wctomb); -EXPORT_SYMBOL(utf8_wcstombs); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c index aa2c42fdd97..0d60a44acac 100644 --- a/fs/nls/nls_utf8.c +++ b/fs/nls/nls_utf8.c @@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) { int n; - if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) { + if (boundlen <= 0) + return -ENAMETOOLONG; + + n = utf32_to_utf8(uni, out, boundlen); + if (n < 0) { *out = '?'; return -EINVAL; } @@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen) static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni) { int n; + unicode_t u; - if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) { + n = utf8_to_utf32(rawstring, boundlen, &u); + if (n < 0 || u > MAX_WCHAR_T) { *uni = 0x003f; /* ? */ - n = -EINVAL; + return -EINVAL; } + *uni = (wchar_t) u; return n; } diff --git a/include/linux/nls.h b/include/linux/nls.h index 52b1a76c1b4..d47beef08df 100644 --- a/include/linux/nls.h +++ b/include/linux/nls.h @@ -3,8 +3,23 @@ #include -/* unicode character */ -typedef __u16 wchar_t; +/* Unicode has changed over the years. Unicode code points no longer + * fit into 16 bits; as of Unicode 5 valid code points range from 0 + * to 0x10ffff (17 planes, where each plane holds 65536 code points). + * + * The original decision to represent Unicode characters as 16-bit + * wchar_t values is now outdated. But plane 0 still includes the + * most commonly used characters, so we will retain it. The newer + * 32-bit unicode_t type can be used when it is necessary to + * represent the full Unicode character set. + */ + +/* Plane-0 Unicode character */ +typedef u16 wchar_t; +#define MAX_WCHAR_T 0xffff + +/* Arbitrary Unicode character */ +typedef u32 unicode_t; struct nls_table { const char *charset; @@ -21,6 +36,13 @@ struct nls_table { /* this value hold the maximum octet of charset */ #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */ +/* Byte order for UTF-16 strings */ +enum utf16_endian { + UTF16_HOST_ENDIAN, + UTF16_LITTLE_ENDIAN, + UTF16_BIG_ENDIAN +}; + /* nls.c */ extern int register_nls(struct nls_table *); extern int unregister_nls(struct nls_table *); @@ -28,10 +50,11 @@ extern struct nls_table *load_nls(char *); extern void unload_nls(struct nls_table *); extern struct nls_table *load_nls_default(void); -extern int utf8_mbtowc(wchar_t *, const __u8 *, int); -extern int utf8_mbstowcs(wchar_t *, const __u8 *, int); -extern int utf8_wctomb(__u8 *, wchar_t, int); -extern int utf8_wcstombs(__u8 *, const wchar_t *, int); +extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu); +extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen); +extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs); +extern int utf16s_to_utf8s(const wchar_t *pwcs, int len, + enum utf16_endian endian, u8 *s, int maxlen); static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c) { -- cgit v1.2.3-70-g09d2 From f7c52fd17a7dda42fc9e88c2b2678403419bfe63 Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Tue, 16 Jun 2009 13:43:22 -0500 Subject: jfs: fix regression preventing coalescing of extents Commit fec1878fe952b994125a3be7c94b1322db586f3b caused a regression in which contiguous blocks being allocated to the end of an extent were getting a new extent created. This typically results in files entirely made up of 1-block extents even though the blocks are contiguous on disk. Apparently grub doesn't handle a jfs file being fragmented into too many extents, since it refuses to boot a kernel from jfs that was created by the 2.6.30 kernel. Signed-off-by: Dave Kleikamp Reported-by: Alex --- fs/jfs/jfs_extent.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index bbbd5f202e3..41d6045dbeb 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -391,6 +391,7 @@ int extHint(struct inode *ip, s64 offset, xad_t * xp) } XADaddress(xp, xaddr); XADlength(xp, xlen); + XADoffset(xp, prev); /* * only preserve the abnr flag within the xad flags * of the returned hint. -- cgit v1.2.3-70-g09d2 From 2f38d70fb4e97e7d00e12eaac45790cf6ebd7b22 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 16 Jun 2009 22:07:46 +0200 Subject: shift current_cred() from __f_setown() to f_modown() Shift current_cred() from __f_setown() to f_modown(). This reduces the number of arguments and saves 48 bytes from fs/fcntl.o. [ Note: this doesn't clear euid/uid when pid is set to NULL. But if f_owner.pid == NULL we never use f_owner.uid/euid. Otherwise we'd have a bug anyway: we must not send signals if pid was reset to NULL. ] Signed-off-by: Oleg Nesterov Acked-by: David Howells Signed-off-by: Linus Torvalds --- fs/fcntl.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index 1ad703150de..f9c03ca3b2f 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -198,15 +198,19 @@ static int setfl(int fd, struct file * filp, unsigned long arg) } static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, - uid_t uid, uid_t euid, int force) + int force) { write_lock_irq(&filp->f_owner.lock); if (force || !filp->f_owner.pid) { put_pid(filp->f_owner.pid); filp->f_owner.pid = get_pid(pid); filp->f_owner.pid_type = type; - filp->f_owner.uid = uid; - filp->f_owner.euid = euid; + + if (pid) { + const struct cred *cred = current_cred(); + filp->f_owner.uid = cred->uid; + filp->f_owner.euid = cred->euid; + } } write_unlock_irq(&filp->f_owner.lock); } @@ -214,14 +218,13 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - const struct cred *cred = current_cred(); int err; - + err = security_file_set_fowner(filp); if (err) return err; - f_modown(filp, pid, type, cred->uid, cred->euid, force); + f_modown(filp, pid, type, force); return 0; } EXPORT_SYMBOL(__f_setown); @@ -247,7 +250,7 @@ EXPORT_SYMBOL(f_setown); void f_delown(struct file *filp) { - f_modown(filp, NULL, PIDTYPE_PID, 0, 0, 1); + f_modown(filp, NULL, PIDTYPE_PID, 1); } pid_t f_getown(struct file *filp) -- cgit v1.2.3-70-g09d2 From 8eeee4e2f04fc551f50c9d9847da2d73d7d33728 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 17 Jun 2009 00:27:10 +0200 Subject: send_sigio_to_task: sanitize the usage of fown->signum send_sigio_to_task() reads fown->signum several times, we can race with F_SETSIG which changes ->signum lockless. In theory, this can fool security checks or we can call group_send_sig_info() with the wrong ->si_signo which does not match "int sig". Change the code to cache ->signum. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- fs/fcntl.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/fcntl.c b/fs/fcntl.c index f9c03ca3b2f..a040b764f8e 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -428,14 +428,20 @@ static inline int sigio_perm(struct task_struct *p, } static void send_sigio_to_task(struct task_struct *p, - struct fown_struct *fown, + struct fown_struct *fown, int fd, int reason) { - if (!sigio_perm(p, fown, fown->signum)) + /* + * F_SETSIG can change ->signum lockless in parallel, make + * sure we read it once and use the same value throughout. + */ + int signum = ACCESS_ONCE(fown->signum); + + if (!sigio_perm(p, fown, signum)) return; - switch (fown->signum) { + switch (signum) { siginfo_t si; default: /* Queue a rt signal with the appropriate fd as its @@ -444,7 +450,7 @@ static void send_sigio_to_task(struct task_struct *p, delivered even if we can't queue. Failure to queue in this case _should_ be reported; we fall back to SIGIO in that case. --sct */ - si.si_signo = fown->signum; + si.si_signo = signum; si.si_errno = 0; si.si_code = reason; /* Make sure we are called with one of the POLL_* @@ -456,7 +462,7 @@ static void send_sigio_to_task(struct task_struct *p, else si.si_band = band_table[reason - POLL_IN]; si.si_fd = fd; - if (!group_send_sig_info(fown->signum, &si, p)) + if (!group_send_sig_info(signum, &si, p)) break; /* fall-through: fall back on the old plain SIGIO signal */ case 0: -- cgit v1.2.3-70-g09d2 From 20a0307c0396c2edb651401d2f2db193dda2f3c9 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:22 -0700 Subject: mm: introduce PageHuge() for testing huge/gigantic pages A series of patches to enhance the /proc/pagemap interface and to add a userspace executable which can be used to present the pagemap data. Export 10 more flags to end users (and more for kernel developers): 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. a simple demo of the page-types tool # ./page-types -h page-types [options] -r|--raw Raw mode, for kernel developers -a|--addr addr-spec Walk a range of pages -b|--bits bits-spec Walk pages with specified bits -l|--list Show page details in ranges -L|--list-each Show page details one by one -N|--no-summary Don't show summay info -h|--help Show this usage message addr-spec: N one page at offset N (unit: pages) N+M pages range from N to N+M-1 N,M pages range from N to M-1 N, pages range from N to end ,M pages range from 0 to M bits-spec: bit1,bit2 (flags & (bit1|bit2)) != 0 bit1,bit2=bit1 (flags & (bit1|bit2)) == bit1 bit1,~bit2 (flags & (bit1|bit2)) == bit1 =bit1,bit2 flags == (bit1|bit2) bit-names: locked error referenced uptodate dirty lru active slab writeback reclaim buddy mmap anonymous swapcache swapbacked compound_head compound_tail huge unevictable hwpoison nopage reserved(r) mlocked(r) mappedtodisk(r) private(r) private_2(r) owner_private(r) arch(r) uncached(r) readahead(o) slob_free(o) slub_frozen(o) slub_debug(o) (r) raw mode bits (o) overloaded bits # ./page-types flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 487369 1903 _________________________________ 0x0000000000000014 5 0 __R_D____________________________ referenced,dirty 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000000000024 34 0 __R__l___________________________ referenced,lru 0x0000000000000028 3838 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 48 0 ___U_l_______________________I___ uptodate,lru,readahead 0x000000000000002c 6478 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x0000000000000040 8344 32 ______A__________________________ active 0x0000000000000060 1 0 _____lA__________________________ lru,active 0x0000000000000068 348 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x000000000000006c 988 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 503 1 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 30 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types -r flags page-count MB symbolic-flags long-symbolic-flags 0x0000000000000000 468002 1828 _________________________________ 0x0000000100000000 19102 74 _____________________r___________ reserved 0x0000000000008000 41 0 _______________H_________________ compound_head 0x0000000000010000 188 0 ________________T________________ compound_tail 0x0000000000008014 1 0 __R_D__________H_________________ referenced,dirty,compound_head 0x0000000000010014 4 0 __R_D___________T________________ referenced,dirty,compound_tail 0x0000000000000020 1 0 _____l___________________________ lru 0x0000000800000024 34 0 __R__l__________________P________ referenced,lru,private 0x0000000000000028 3794 14 ___U_l___________________________ uptodate,lru 0x0001000000000028 46 0 ___U_l_______________________I___ uptodate,lru,readahead 0x0000000400000028 44 0 ___U_l_________________d_________ uptodate,lru,mappedtodisk 0x0001000400000028 2 0 ___U_l_________________d_____I___ uptodate,lru,mappedtodisk,readahead 0x000000000000002c 6434 25 __RU_l___________________________ referenced,uptodate,lru 0x000100000000002c 47 0 __RU_l_______________________I___ referenced,uptodate,lru,readahead 0x000000040000002c 14 0 __RU_l_________________d_________ referenced,uptodate,lru,mappedtodisk 0x000000080000002c 30 0 __RU_l__________________P________ referenced,uptodate,lru,private 0x0000000800000040 8124 31 ______A_________________P________ active,private 0x0000000000000040 219 0 ______A__________________________ active 0x0000000800000060 1 0 _____lA_________________P________ lru,active,private 0x0000000000000068 322 1 ___U_lA__________________________ uptodate,lru,active 0x0001000000000068 12 0 ___U_lA______________________I___ uptodate,lru,active,readahead 0x0000000400000068 13 0 ___U_lA________________d_________ uptodate,lru,active,mappedtodisk 0x0000000800000068 12 0 ___U_lA_________________P________ uptodate,lru,active,private 0x000000000000006c 977 3 __RU_lA__________________________ referenced,uptodate,lru,active 0x000100000000006c 48 0 __RU_lA______________________I___ referenced,uptodate,lru,active,readahead 0x000000040000006c 5 0 __RU_lA________________d_________ referenced,uptodate,lru,active,mappedtodisk 0x000000080000006c 3 0 __RU_lA_________________P________ referenced,uptodate,lru,active,private 0x0000000c0000006c 3 0 __RU_lA________________dP________ referenced,uptodate,lru,active,mappedtodisk,private 0x0000000c00000068 1 0 ___U_lA________________dP________ uptodate,lru,active,mappedtodisk,private 0x0000000000004078 1 0 ___UDlA_______b__________________ uptodate,dirty,lru,active,swapbacked 0x000000000000407c 34 0 __RUDlA_______b__________________ referenced,uptodate,dirty,lru,active,swapbacked 0x0000000000000400 538 2 __________B______________________ buddy 0x0000000000000804 1 0 __R________M_____________________ referenced,mmap 0x0000000000000828 1029 4 ___U_l_____M_____________________ uptodate,lru,mmap 0x0001000000000828 43 0 ___U_l_____M_________________I___ uptodate,lru,mmap,readahead 0x000000000000082c 382 1 __RU_l_____M_____________________ referenced,uptodate,lru,mmap 0x000100000000082c 12 0 __RU_l_____M_________________I___ referenced,uptodate,lru,mmap,readahead 0x0000000000000868 192 0 ___U_lA____M_____________________ uptodate,lru,active,mmap 0x0001000000000868 12 0 ___U_lA____M_________________I___ uptodate,lru,active,mmap,readahead 0x000000000000086c 800 3 __RU_lA____M_____________________ referenced,uptodate,lru,active,mmap 0x000100000000086c 31 0 __RU_lA____M_________________I___ referenced,uptodate,lru,active,mmap,readahead 0x0000000000004878 2 0 ___UDlA____M__b__________________ uptodate,dirty,lru,active,mmap,swapbacked 0x0000000000001000 492 1 ____________a____________________ anonymous 0x0000000000005008 2 0 ___U________a_b__________________ uptodate,anonymous,swapbacked 0x0000000000005808 4 0 ___U_______Ma_b__________________ uptodate,mmap,anonymous,swapbacked 0x000000000000580c 1 0 __RU_______Ma_b__________________ referenced,uptodate,mmap,anonymous,swapbacked 0x0000000000005868 2839 11 ___U_lA____Ma_b__________________ uptodate,lru,active,mmap,anonymous,swapbacked 0x000000000000586c 29 0 __RU_lA____Ma_b__________________ referenced,uptodate,lru,active,mmap,anonymous,swapbacked total 513968 2007 # ./page-types --raw --list --no-summary --bits reserved offset count flags 0 15 _____________________r___________ 31 4 _____________________r___________ 159 97 _____________________r___________ 4096 2067 _____________________r___________ 6752 2390 _____________________r___________ 9355 3 _____________________r___________ 9728 14526 _____________________r___________ This patch: Introduce PageHuge(), which identifies huge/gigantic pages by their dedicated compound destructor functions. Also move prep_compound_gigantic_page() to hugetlb.c and make __free_pages_ok() non-static. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 1 + include/linux/hugetlb.h | 7 ++++ mm/hugetlb.c | 98 +++++++++++++++++++++++++++++++------------------ mm/internal.h | 5 +-- mm/page_alloc.c | 17 --------- 5 files changed, 73 insertions(+), 55 deletions(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index e9983837d08..38dd88b7ce8 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -6,6 +6,7 @@ #include #include #include +#include #include #include "internal.h" diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 03be7f29ca0..a05a5ef3339 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -11,6 +11,8 @@ struct ctl_table; +int PageHuge(struct page *page); + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return vma->vm_flags & VM_HUGETLB; @@ -61,6 +63,11 @@ void hugetlb_change_protection(struct vm_area_struct *vma, #else /* !CONFIG_HUGETLB_PAGE */ +static inline int PageHuge(struct page *page) +{ + return 0; +} + static inline int is_vm_hugetlb_page(struct vm_area_struct *vma) { return 0; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7b9b6015b2e..a56e6f3ce97 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -578,41 +578,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -623,6 +588,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -1140,6 +1133,41 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + static int prev_nid; + int nid = prev_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + do { + nid = next_node(nid, node_online_map); + if (nid == MAX_NUMNODES) + nid = first_node(node_online_map); + + /* To shrink on this node, there must be a surplus page */ + if (delta < 0 && !h->surplus_huge_pages_node[nid]) + continue; + /* Surplus cannot exceed the total number of pages */ + if (delta > 0 && h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (nid != prev_nid); + + prev_nid = nid; + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { diff --git a/mm/internal.h b/mm/internal.h index 4b1672a8cf7..b4ac332e807 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -16,9 +16,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); -extern void prep_compound_page(struct page *page, unsigned long order); -extern void prep_compound_gigantic_page(struct page *page, unsigned long order); - static inline void set_page_count(struct page *page, int v) { atomic_set(&page->_count, v); @@ -51,6 +48,8 @@ extern void putback_lru_page(struct page *page); */ extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); +extern void prep_compound_page(struct page *page, unsigned long order); + /* * function for dealing with page's order in buddy system. diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ca06d87dc1..131655cdb6b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -300,23 +300,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v1.2.3-70-g09d2 From ed7ce0f1022942301776f93159c981b09382ddea Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:23 -0700 Subject: proc: kpagecount/kpageflags code cleanup Move increments of pfn/out to bottom of the loop. Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Acked-by: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 38dd88b7ce8..e73e911b7d0 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -12,6 +12,7 @@ #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) + /* /proc/kpagecount - an array exposing page counts * * Each entry is a u64 representing the corresponding @@ -33,20 +34,22 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; + else + ppage = NULL; if (!ppage) pcount = 0; else pcount = page_mapcount(ppage); - if (put_user(pcount, out++)) { + if (put_user(pcount, out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } @@ -99,10 +102,10 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, return -EINVAL; while (count > 0) { - ppage = NULL; if (pfn_valid(pfn)) ppage = pfn_to_page(pfn); - pfn++; + else + ppage = NULL; if (!ppage) kflags = 0; else @@ -120,11 +123,13 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - if (put_user(uflags, out++)) { + if (put_user(uflags, out)) { ret = -EFAULT; break; } + pfn++; + out++; count -= KPMSIZE; } -- cgit v1.2.3-70-g09d2 From 177975495914efb372f7edee28ba9a0fdb754149 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:32:24 -0700 Subject: proc: export more page flags in /proc/kpageflags Export all page flags faithfully in /proc/kpageflags. 11. KPF_MMAP (pseudo flag) memory mapped page 12. KPF_ANON (pseudo flag) memory mapped page (anonymous) 13. KPF_SWAPCACHE page is in swap cache 14. KPF_SWAPBACKED page is swap/RAM backed 15. KPF_COMPOUND_HEAD (*) 16. KPF_COMPOUND_TAIL (*) 17. KPF_HUGE hugeTLB pages 18. KPF_UNEVICTABLE page is in the unevictable LRU list 19. KPF_HWPOISON(TBD) hardware detected corruption 20. KPF_NOPAGE (pseudo flag) no page frame at the address 32-39. more obscure flags for kernel developers (*) For compound pages, exporting _both_ head/tail info enables users to tell where a compound page starts/ends, and its order. The accompanying page-types tool will handle the details like decoupling overloaded flags and hiding obscure flags to normal users. Thanks to KOSAKI and Andi for their valuable recommendations! Signed-off-by: Wu Fengguang Cc: KOSAKI Motohiro Cc: Andi Kleen Cc: Matt Mackall Cc: Alexey Dobriyan Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 152 +++++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 120 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index e73e911b7d0..9d926bd279a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -72,19 +72,124 @@ static const struct file_operations proc_kpagecount_operations = { /* These macros are used to decouple internal flags from exported ones */ -#define KPF_LOCKED 0 -#define KPF_ERROR 1 -#define KPF_REFERENCED 2 -#define KPF_UPTODATE 3 -#define KPF_DIRTY 4 -#define KPF_LRU 5 -#define KPF_ACTIVE 6 -#define KPF_SLAB 7 -#define KPF_WRITEBACK 8 -#define KPF_RECLAIM 9 -#define KPF_BUDDY 10 - -#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) +#define KPF_LOCKED 0 +#define KPF_ERROR 1 +#define KPF_REFERENCED 2 +#define KPF_UPTODATE 3 +#define KPF_DIRTY 4 +#define KPF_LRU 5 +#define KPF_ACTIVE 6 +#define KPF_SLAB 7 +#define KPF_WRITEBACK 8 +#define KPF_RECLAIM 9 +#define KPF_BUDDY 10 + +/* 11-20: new additions in 2.6.31 */ +#define KPF_MMAP 11 +#define KPF_ANON 12 +#define KPF_SWAPCACHE 13 +#define KPF_SWAPBACKED 14 +#define KPF_COMPOUND_HEAD 15 +#define KPF_COMPOUND_TAIL 16 +#define KPF_HUGE 17 +#define KPF_UNEVICTABLE 18 +#define KPF_NOPAGE 20 + +/* kernel hacking assistances + * WARNING: subject to change, never rely on them! + */ +#define KPF_RESERVED 32 +#define KPF_MLOCKED 33 +#define KPF_MAPPEDTODISK 34 +#define KPF_PRIVATE 35 +#define KPF_PRIVATE_2 36 +#define KPF_OWNER_PRIVATE 37 +#define KPF_ARCH 38 +#define KPF_UNCACHED 39 + +static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) +{ + return ((kflags >> kbit) & 1) << ubit; +} + +static u64 get_uflags(struct page *page) +{ + u64 k; + u64 u; + + /* + * pseudo flag: KPF_NOPAGE + * it differentiates a memory hole from a page with no flags + */ + if (!page) + return 1 << KPF_NOPAGE; + + k = page->flags; + u = 0; + + /* + * pseudo flags for the well known (anonymous) memory mapped pages + * + * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the + * simple test in page_mapped() is not enough. + */ + if (!PageSlab(page) && page_mapped(page)) + u |= 1 << KPF_MMAP; + if (PageAnon(page)) + u |= 1 << KPF_ANON; + + /* + * compound pages: export both head/tail info + * they together define a compound page's start/end pos and order + */ + if (PageHead(page)) + u |= 1 << KPF_COMPOUND_HEAD; + if (PageTail(page)) + u |= 1 << KPF_COMPOUND_TAIL; + if (PageHuge(page)) + u |= 1 << KPF_HUGE; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + + /* + * Caveats on high order pages: + * PG_buddy will only be set on the head page; SLUB/SLQB do the same + * for PG_slab; SLOB won't set PG_slab at all on compound pages. + */ + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); + u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); + + u |= kpf_copy_bit(k, KPF_ERROR, PG_error); + u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); + u |= kpf_copy_bit(k, KPF_UPTODATE, PG_uptodate); + u |= kpf_copy_bit(k, KPF_WRITEBACK, PG_writeback); + + u |= kpf_copy_bit(k, KPF_LRU, PG_lru); + u |= kpf_copy_bit(k, KPF_REFERENCED, PG_referenced); + u |= kpf_copy_bit(k, KPF_ACTIVE, PG_active); + u |= kpf_copy_bit(k, KPF_RECLAIM, PG_reclaim); + + u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); + u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); + +#ifdef CONFIG_UNEVICTABLE_LRU + u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); + u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); +#endif + +#ifdef CONFIG_IA64_UNCACHED_ALLOCATOR + u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); +#endif + + u |= kpf_copy_bit(k, KPF_RESERVED, PG_reserved); + u |= kpf_copy_bit(k, KPF_MAPPEDTODISK, PG_mappedtodisk); + u |= kpf_copy_bit(k, KPF_PRIVATE, PG_private); + u |= kpf_copy_bit(k, KPF_PRIVATE_2, PG_private_2); + u |= kpf_copy_bit(k, KPF_OWNER_PRIVATE, PG_owner_priv_1); + u |= kpf_copy_bit(k, KPF_ARCH, PG_arch_1); + + return u; +}; static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) @@ -94,7 +199,6 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; - u64 kflags, uflags; pfn = src / KPMSIZE; count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src); @@ -106,24 +210,8 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, ppage = pfn_to_page(pfn); else ppage = NULL; - if (!ppage) - kflags = 0; - else - kflags = ppage->flags; - - uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | - kpf_copy_bit(kflags, KPF_ERROR, PG_error) | - kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | - kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | - kpf_copy_bit(kflags, KPF_DIRTY, PG_dirty) | - kpf_copy_bit(kflags, KPF_LRU, PG_lru) | - kpf_copy_bit(kflags, KPF_ACTIVE, PG_active) | - kpf_copy_bit(kflags, KPF_SLAB, PG_slab) | - kpf_copy_bit(kflags, KPF_WRITEBACK, PG_writeback) | - kpf_copy_bit(kflags, KPF_RECLAIM, PG_reclaim) | - kpf_copy_bit(kflags, KPF_BUDDY, PG_buddy); - - if (put_user(uflags, out)) { + + if (put_user(get_uflags(ppage), out)) { ret = -EFAULT; break; } -- cgit v1.2.3-70-g09d2 From 6837765963f1723e80ca97b1fae660f3a60d77df Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 16 Jun 2009 15:32:51 -0700 Subject: mm: remove CONFIG_UNEVICTABLE_LRU config option Currently, nobody wants to turn UNEVICTABLE_LRU off. Thus this configurability is unnecessary. Signed-off-by: KOSAKI Motohiro Cc: Johannes Weiner Cc: Andi Kleen Acked-by: Minchan Kim Cc: David Woodhouse Cc: Matt Mackall Cc: Rik van Riel Cc: Lee Schermerhorn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/base/node.c | 4 ---- fs/proc/meminfo.c | 4 ---- fs/proc/page.c | 2 -- include/linux/mmzone.h | 13 ------------- include/linux/page-flags.h | 16 +--------------- include/linux/pagemap.h | 12 ------------ include/linux/rmap.h | 7 ------- include/linux/swap.h | 19 ------------------- include/linux/vmstat.h | 2 -- kernel/sysctl.c | 2 -- mm/Kconfig | 14 +------------- mm/internal.h | 6 ------ mm/mlock.c | 22 ---------------------- mm/page_alloc.c | 9 --------- mm/rmap.c | 3 +-- mm/vmscan.c | 17 ----------------- mm/vmstat.c | 4 ---- 17 files changed, 3 insertions(+), 153 deletions(-) (limited to 'fs') diff --git a/drivers/base/node.c b/drivers/base/node.c index 40b809742a1..91d4087b403 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -72,10 +72,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, "Node %d Inactive(anon): %8lu kB\n" "Node %d Active(file): %8lu kB\n" "Node %d Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Node %d Unevictable: %8lu kB\n" "Node %d Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "Node %d HighTotal: %8lu kB\n" "Node %d HighFree: %8lu kB\n" @@ -105,10 +103,8 @@ static ssize_t node_read_meminfo(struct sys_device * dev, nid, K(node_page_state(nid, NR_INACTIVE_ANON)), nid, K(node_page_state(nid, NR_ACTIVE_FILE)), nid, K(node_page_state(nid, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU nid, K(node_page_state(nid, NR_UNEVICTABLE)), nid, K(node_page_state(nid, NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM nid, K(i.totalhigh), nid, K(i.freehigh), diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index c6b0302af4c..d5c410d47fa 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -64,10 +64,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "Inactive(anon): %8lu kB\n" "Active(file): %8lu kB\n" "Inactive(file): %8lu kB\n" -#ifdef CONFIG_UNEVICTABLE_LRU "Unevictable: %8lu kB\n" "Mlocked: %8lu kB\n" -#endif #ifdef CONFIG_HIGHMEM "HighTotal: %8lu kB\n" "HighFree: %8lu kB\n" @@ -109,10 +107,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(pages[LRU_INACTIVE_ANON]), K(pages[LRU_ACTIVE_FILE]), K(pages[LRU_INACTIVE_FILE]), -#ifdef CONFIG_UNEVICTABLE_LRU K(pages[LRU_UNEVICTABLE]), K(global_page_state(NR_MLOCK)), -#endif #ifdef CONFIG_HIGHMEM K(i.totalhigh), K(i.freehigh), diff --git a/fs/proc/page.c b/fs/proc/page.c index 9d926bd279a..2707c6c7a20 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -172,10 +172,8 @@ static u64 get_uflags(struct page *page) u |= kpf_copy_bit(k, KPF_SWAPCACHE, PG_swapcache); u |= kpf_copy_bit(k, KPF_SWAPBACKED, PG_swapbacked); -#ifdef CONFIG_UNEVICTABLE_LRU u |= kpf_copy_bit(k, KPF_UNEVICTABLE, PG_unevictable); u |= kpf_copy_bit(k, KPF_MLOCKED, PG_mlocked); -#endif #ifdef CONFIG_IA64_UNCACHED_ALLOCATOR u |= kpf_copy_bit(k, KPF_UNCACHED, PG_uncached); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index db976b9f879..88959853737 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -83,13 +83,8 @@ enum zone_stat_item { NR_ACTIVE_ANON, /* " " " " " */ NR_INACTIVE_FILE, /* " " " " " */ NR_ACTIVE_FILE, /* " " " " " */ -#ifdef CONFIG_UNEVICTABLE_LRU NR_UNEVICTABLE, /* " " " " " */ NR_MLOCK, /* mlock()ed pages found and moved off LRU */ -#else - NR_UNEVICTABLE = NR_ACTIVE_FILE, /* avoid compiler errors in dead code */ - NR_MLOCK = NR_ACTIVE_FILE, -#endif NR_ANON_PAGES, /* Mapped anonymous pages */ NR_FILE_MAPPED, /* pagecache pages mapped into pagetables. only modified from process context */ @@ -132,11 +127,7 @@ enum lru_list { LRU_ACTIVE_ANON = LRU_BASE + LRU_ACTIVE, LRU_INACTIVE_FILE = LRU_BASE + LRU_FILE, LRU_ACTIVE_FILE = LRU_BASE + LRU_FILE + LRU_ACTIVE, -#ifdef CONFIG_UNEVICTABLE_LRU LRU_UNEVICTABLE, -#else - LRU_UNEVICTABLE = LRU_ACTIVE_FILE, /* avoid compiler errors in dead code */ -#endif NR_LRU_LISTS }; @@ -156,11 +147,7 @@ static inline int is_active_lru(enum lru_list l) static inline int is_unevictable_lru(enum lru_list l) { -#ifdef CONFIG_UNEVICTABLE_LRU return (l == LRU_UNEVICTABLE); -#else - return 0; -#endif } enum zone_watermarks { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 62214c7d2d9..d6792f88a17 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -95,9 +95,7 @@ enum pageflags { PG_reclaim, /* To be reclaimed asap */ PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ -#ifdef CONFIG_UNEVICTABLE_LRU PG_unevictable, /* Page is "unevictable" */ -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT PG_mlocked, /* Page is vma mlocked */ #endif @@ -248,14 +246,8 @@ PAGEFLAG_FALSE(SwapCache) SETPAGEFLAG_NOOP(SwapCache) CLEARPAGEFLAG_NOOP(SwapCache) #endif -#ifdef CONFIG_UNEVICTABLE_LRU PAGEFLAG(Unevictable, unevictable) __CLEARPAGEFLAG(Unevictable, unevictable) TESTCLEARFLAG(Unevictable, unevictable) -#else -PAGEFLAG_FALSE(Unevictable) TESTCLEARFLAG_FALSE(Unevictable) - SETPAGEFLAG_NOOP(Unevictable) CLEARPAGEFLAG_NOOP(Unevictable) - __CLEARPAGEFLAG_NOOP(Unevictable) -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define MLOCK_PAGES 1 @@ -382,12 +374,6 @@ static inline void __ClearPageTail(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ -#ifdef CONFIG_UNEVICTABLE_LRU -#define __PG_UNEVICTABLE (1 << PG_unevictable) -#else -#define __PG_UNEVICTABLE 0 -#endif - #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT #define __PG_MLOCKED (1 << PG_mlocked) #else @@ -403,7 +389,7 @@ static inline void __ClearPageTail(struct page *page) 1 << PG_private | 1 << PG_private_2 | \ 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ - __PG_UNEVICTABLE | __PG_MLOCKED) + 1 << PG_unevictable | __PG_MLOCKED) /* * Flags checked when a page is prepped for return by the page allocator. diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 34da5230faa..aec3252afcf 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -22,9 +22,7 @@ enum mapping_flags { AS_EIO = __GFP_BITS_SHIFT + 0, /* IO error on async write */ AS_ENOSPC = __GFP_BITS_SHIFT + 1, /* ENOSPC on async write */ AS_MM_ALL_LOCKS = __GFP_BITS_SHIFT + 2, /* under mm_take_all_locks() */ -#ifdef CONFIG_UNEVICTABLE_LRU AS_UNEVICTABLE = __GFP_BITS_SHIFT + 3, /* e.g., ramdisk, SHM_LOCK */ -#endif }; static inline void mapping_set_error(struct address_space *mapping, int error) @@ -37,8 +35,6 @@ static inline void mapping_set_error(struct address_space *mapping, int error) } } -#ifdef CONFIG_UNEVICTABLE_LRU - static inline void mapping_set_unevictable(struct address_space *mapping) { set_bit(AS_UNEVICTABLE, &mapping->flags); @@ -55,14 +51,6 @@ static inline int mapping_unevictable(struct address_space *mapping) return test_bit(AS_UNEVICTABLE, &mapping->flags); return !!mapping; } -#else -static inline void mapping_set_unevictable(struct address_space *mapping) { } -static inline void mapping_clear_unevictable(struct address_space *mapping) { } -static inline int mapping_unevictable(struct address_space *mapping) -{ - return 0; -} -#endif static inline gfp_t mapping_gfp_mask(struct address_space * mapping) { diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b35bc0e19cd..619379a1dd9 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -105,18 +105,11 @@ unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); */ int page_mkclean(struct page *); -#ifdef CONFIG_UNEVICTABLE_LRU /* * called in munlock()/munmap() path to check for other vmas holding * the page mlocked. */ int try_to_munlock(struct page *); -#else -static inline int try_to_munlock(struct page *page) -{ - return 0; /* a.k.a. SWAP_SUCCESS */ -} -#endif #else /* !CONFIG_MMU */ diff --git a/include/linux/swap.h b/include/linux/swap.h index d476aad3ff5..f30c06908f0 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -235,7 +235,6 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU extern int page_evictable(struct page *page, struct vm_area_struct *vma); extern void scan_mapping_unevictable_pages(struct address_space *); @@ -244,24 +243,6 @@ extern int scan_unevictable_handler(struct ctl_table *, int, struct file *, void __user *, size_t *, loff_t *); extern int scan_unevictable_register_node(struct node *node); extern void scan_unevictable_unregister_node(struct node *node); -#else -static inline int page_evictable(struct page *page, - struct vm_area_struct *vma) -{ - return 1; -} - -static inline void scan_mapping_unevictable_pages(struct address_space *mapping) -{ -} - -static inline int scan_unevictable_register_node(struct node *node) -{ - return 0; -} - -static inline void scan_unevictable_unregister_node(struct node *node) { } -#endif extern int kswapd_run(int nid); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 524cd1b28ec..ff4696c6dce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -41,7 +41,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, #ifdef CONFIG_HUGETLB_PAGE HTLB_BUDDY_PGALLOC, HTLB_BUDDY_PGALLOC_FAIL, #endif -#ifdef CONFIG_UNEVICTABLE_LRU UNEVICTABLE_PGCULLED, /* culled to noreclaim list */ UNEVICTABLE_PGSCANNED, /* scanned for reclaimability */ UNEVICTABLE_PGRESCUED, /* rescued from noreclaim list */ @@ -50,7 +49,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, UNEVICTABLE_PGCLEARED, /* on COW, page truncate */ UNEVICTABLE_PGSTRANDED, /* unable to isolate on unlock */ UNEVICTABLE_MLOCKFREED, -#endif NR_VM_EVENT_ITEMS }; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 0e51a35a448..2ccee08f92f 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1325,7 +1325,6 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif -#ifdef CONFIG_UNEVICTABLE_LRU { .ctl_name = CTL_UNNUMBERED, .procname = "scan_unevictable_pages", @@ -1334,7 +1333,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = &scan_unevictable_handler, }, -#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt diff --git a/mm/Kconfig b/mm/Kconfig index 71830ba7b98..97d2c88b745 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -203,25 +203,13 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config UNEVICTABLE_LRU - bool "Add LRU list to track non-evictable pages" - default y - help - Keeps unevictable pages off of the active and inactive pageout - lists, so kswapd will not waste CPU time or have its balancing - algorithms thrown off by scanning these pages. Selecting this - will use one page flag and increase the code size a little, - say Y unless you know what you are doing. - - See Documentation/vm/unevictable-lru.txt for more information. - config HAVE_MLOCK bool default y if MMU=y config HAVE_MLOCKED_PAGE_BIT bool - default y if HAVE_MLOCK=y && UNEVICTABLE_LRU=y + default y if HAVE_MLOCK=y config MMU_NOTIFIER bool diff --git a/mm/internal.h b/mm/internal.h index b4ac332e807..f02c7508068 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -73,7 +73,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * unevictable_migrate_page() called only from migrate_page_copy() to * migrate unevictable flag to new page. @@ -85,11 +84,6 @@ static inline void unevictable_migrate_page(struct page *new, struct page *old) if (TestClearPageUnevictable(old)) SetPageUnevictable(new); } -#else -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ -} -#endif #ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* diff --git a/mm/mlock.c b/mm/mlock.c index ac130433c7d..45eb650b965 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -31,7 +31,6 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); -#ifdef CONFIG_UNEVICTABLE_LRU /* * Mlocked pages are marked with PageMlocked() flag for efficient testing * in vmscan and, possibly, the fault path; and to support semi-accurate @@ -261,27 +260,6 @@ static int __mlock_posix_error_return(long retval) return retval; } -#else /* CONFIG_UNEVICTABLE_LRU */ - -/* - * Just make pages present if VM_LOCKED. No-op if unlocking. - */ -static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) -{ - if (mlock && (vma->vm_flags & VM_LOCKED)) - return make_pages_present(start, end); - return 0; -} - -static inline int __mlock_posix_error_return(long retval) -{ - return 0; -} - -#endif /* CONFIG_UNEVICTABLE_LRU */ - /** * mlock_vma_pages_range() - mlock pages in specified vma range. * @vma - the vma containing the specfied address range diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 00e293734fc..c95a77cd581 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2077,19 +2077,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2113,9 +2108,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2129,9 +2122,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") diff --git a/mm/rmap.c b/mm/rmap.c index 23122af3261..316c9d6930a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1202,7 +1202,6 @@ int try_to_unmap(struct page *page, int migration) return ret; } -#ifdef CONFIG_UNEVICTABLE_LRU /** * try_to_munlock - try to munlock a page * @page: the page to be munlocked @@ -1226,4 +1225,4 @@ int try_to_munlock(struct page *page) else return try_to_unmap_file(page, 1, 0); } -#endif + diff --git a/mm/vmscan.c b/mm/vmscan.c index 879d034930c..2c4b945b011 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -514,7 +514,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -568,20 +567,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -2470,7 +2455,6 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2717,4 +2701,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 1e151cf6bf8..1e3aa8139f2 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -629,10 +629,8 @@ static const char * const vmstat_text[] = { "nr_active_anon", "nr_inactive_file", "nr_active_file", -#ifdef CONFIG_UNEVICTABLE_LRU "nr_unevictable", "nr_mlock", -#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -687,7 +685,6 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif -#ifdef CONFIG_UNEVICTABLE_LRU "unevictable_pgs_culled", "unevictable_pgs_scanned", "unevictable_pgs_rescued", @@ -697,7 +694,6 @@ static const char * const vmstat_text[] = { "unevictable_pgs_stranded", "unevictable_pgs_mlockfreed", #endif -#endif }; static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, -- cgit v1.2.3-70-g09d2 From 2ff05b2b4eac2e63d345fc731ea151a060247f53 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 16 Jun 2009 15:32:56 -0700 Subject: oom: move oom_adj value from task_struct to mm_struct The per-task oom_adj value is a characteristic of its mm more than the task itself since it's not possible to oom kill any thread that shares the mm. If a task were to be killed while attached to an mm that could not be freed because another thread were set to OOM_DISABLE, it would have needlessly been terminated since there is no potential for future memory freeing. This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct mm_struct. This requires task_lock() on a task to check its oom_adj value to protect against exec, but it's already necessary to take the lock when dereferencing the mm to find the total VM size for the badness heuristic. This fixes a livelock if the oom killer chooses a task and another thread sharing the same memory has an oom_adj value of OOM_DISABLE. This occurs because oom_kill_task() repeatedly returns 1 and refuses to kill the chosen task while select_bad_process() will repeatedly choose the same task during the next retry. Taking task_lock() in select_bad_process() to check for OOM_DISABLE and in oom_kill_task() to check for threads sharing the same memory will be removed in the next patch in this series where it will no longer be necessary. Writing to /proc/pid/oom_adj for a kthread will now return -EINVAL since these threads are immune from oom killing already. They simply report an oom_adj value of OOM_DISABLE. Cc: Nick Piggin Cc: Rik van Riel Cc: Mel Gorman Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 ++++++++++----- fs/proc/base.c | 19 ++++++++++++++++--- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 1 - mm/oom_kill.c | 34 ++++++++++++++++++++++------------ 5 files changed, 50 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index cd8717a3627..ebff3c10a07 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1003,11 +1003,13 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes -should be killed in an out-of-memory situation. Giving it a high score will -increase the likelihood of this process being killed by the oom-killer. Valid -values are in the range -16 to +15, plus the special value -17, which disables -oom-killing altogether for this process. +This file can be used to adjust the score used to select which processes should +be killed in an out-of-memory situation. The oom_adj value is a characteristic +of the task's mm, so all threads that share an mm with pid will have the same +oom_adj value. A high value will increase the likelihood of this process being +killed by the oom-killer. Valid values are in the range -16 to +15 as +explained below and a special value of -17, which disables oom-killing +altogether for threads sharing pid's mm. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1021,6 +1023,9 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. +/proc//oom_adj cannot be changed for kthreads since they are immune from +oom-killing already. + /proc//oom_score shows process' current badness score. The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 1539e630c47..3ce5ae9e3d2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1006,7 +1006,12 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - oom_adjust = task->oomkilladj; + task_lock(task); + if (task->mm) + oom_adjust = task->mm->oom_adj; + else + oom_adjust = OOM_DISABLE; + task_unlock(task); put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1035,11 +1040,19 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { + task_lock(task); + if (!task->mm) { + task_unlock(task); + put_task_struct(task); + return -EINVAL; + } + if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { + task_unlock(task); put_task_struct(task); return -EACCES; } - task->oomkilladj = oom_adjust; + task->mm->oom_adj = oom_adjust; + task_unlock(task); put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0e80e26ecf2..f4408106fcb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,6 +232,8 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ + s8 oom_adj; /* OOM kill score adjustment (bit shift) */ + cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 1048bf50540..1bc6fae0c13 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1178,7 +1178,6 @@ struct task_struct { * a short time */ unsigned char fpu_counter; - s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a7b2460e922..b60913520ef 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,6 +58,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj; task_lock(p); mm = p->mm; @@ -65,6 +66,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } + oom_adj = mm->oom_adj; /* * The memory size of the process is the basis for the badness. @@ -148,15 +150,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -251,8 +253,12 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + task_lock(p); + if (p->mm && p->mm->oom_adj == OOM_DISABLE) { + task_unlock(p); continue; + } + task_unlock(p); points = badness(p, uptime.tv_sec); if (points > *ppoints || !chosen) { @@ -304,8 +310,7 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, - p->comm); + get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -367,8 +372,12 @@ static int oom_kill_task(struct task_struct *p) * Don't kill the process if any threads are set to OOM_DISABLE */ do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + task_lock(q); + if (q->mm == mm && q->mm && q->mm->oom_adj == OOM_DISABLE) { + task_unlock(q); return 1; + } + task_unlock(q); } while_each_thread(g, q); __oom_kill_task(p, 1); @@ -393,10 +402,11 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); task_lock(current); + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->mm ? current->mm->oom_adj : OOM_DISABLE); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); -- cgit v1.2.3-70-g09d2 From 286973552f051404abdb58dd9b2f8f7558efe4e5 Mon Sep 17 00:00:00 2001 From: Mike Waychison Date: Tue, 16 Jun 2009 15:32:59 -0700 Subject: mm: remove __invalidate_mapping_pages variant Remove __invalidate_mapping_pages atomic variant now that its sole caller can sleep (fixed in eccb95cee4f0d56faa46ef22fb94dd4a3578d3eb ("vfs: fix lock inversion in drop_pagecache_sb()")). This fixes softlockups that can occur while in the drop_caches path. Signed-off-by: Mike Waychison Cc: Jan Kara Cc: Wu Fengguang Cc: Dave Chinner Cc: Nick Piggin Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/drop_caches.c | 2 +- include/linux/fs.h | 3 --- mm/truncate.c | 39 ++++++++++++++++----------------------- 3 files changed, 17 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/drop_caches.c b/fs/drop_caches.c index b6a719a909f..a2edb791344 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -24,7 +24,7 @@ static void drop_pagecache_sb(struct super_block *sb) continue; __iget(inode); spin_unlock(&inode_lock); - __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; spin_lock(&inode_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 8146e0264ef..f5ae9f19b8a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2036,9 +2036,6 @@ extern int __invalidate_device(struct block_device *); extern int invalidate_partition(struct gendisk *, int); #endif extern int invalidate_inodes(struct super_block *); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, - bool be_atomic); unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end); diff --git a/mm/truncate.c b/mm/truncate.c index 12e1579f916..ccc3ecf7cb9 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -267,8 +267,21 @@ void truncate_inode_pages(struct address_space *mapping, loff_t lstart) } EXPORT_SYMBOL(truncate_inode_pages); -unsigned long __invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end, bool be_atomic) +/** + * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode + * @mapping: the address_space which holds the pages to invalidate + * @start: the offset 'from' which to invalidate + * @end: the offset 'to' which to invalidate (inclusive) + * + * This function only removes the unlocked pages, if you want to + * remove all the pages of one inode, you must call truncate_inode_pages. + * + * invalidate_mapping_pages() will not block on IO activity. It will not + * invalidate pages which are dirty, locked, under writeback or mapped into + * pagetables. + */ +unsigned long invalidate_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t end) { struct pagevec pvec; pgoff_t next = start; @@ -309,30 +322,10 @@ unlock: break; } pagevec_release(&pvec); - if (likely(!be_atomic)) - cond_resched(); + cond_resched(); } return ret; } - -/** - * invalidate_mapping_pages - Invalidate all the unlocked pages of one inode - * @mapping: the address_space which holds the pages to invalidate - * @start: the offset 'from' which to invalidate - * @end: the offset 'to' which to invalidate (inclusive) - * - * This function only removes the unlocked pages, if you want to - * remove all the pages of one inode, you must call truncate_inode_pages. - * - * invalidate_mapping_pages() will not block on IO activity. It will not - * invalidate pages which are dirty, locked, under writeback or mapped into - * pagetables. - */ -unsigned long invalidate_mapping_pages(struct address_space *mapping, - pgoff_t start, pgoff_t end) -{ - return __invalidate_mapping_pages(mapping, start, end, false); -} EXPORT_SYMBOL(invalidate_mapping_pages); /* -- cgit v1.2.3-70-g09d2 From 84a892456046921a40646114deed65e2df93a1bc Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Tue, 16 Jun 2009 15:33:17 -0700 Subject: writeback: skip new or to-be-freed inodes 1) I_FREEING tests should be coupled with I_CLEAR The two I_FREEING tests are racy because clear_inode() can set i_state to I_CLEAR between the clear of I_SYNC and the test of I_FREEING. 2) skip I_WILL_FREE inodes in generic_sync_sb_inodes() to avoid possible races with generic_forget_inode() generic_forget_inode() sets I_WILL_FREE call writeback on its own, so generic_sync_sb_inodes() shall not try to step in and create possible races: generic_forget_inode inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); generic_sync_sb_inodes() spin_lock(&inode_lock); __iget(inode); __writeback_single_inode // see non zero i_count may WARN here ==> WARN_ON(inode->i_state & I_WILL_FREE); spin_unlock(&inode_lock); may call generic_forget_inode again ==> iput(inode); The above race and warning didn't turn up because writeback_inodes() holds the s_umount lock, so generic_forget_inode() finds MS_ACTIVE and returns early. But we are not sure the UBIFS calls and future callers will guarantee that. So skip I_WILL_FREE inodes for the sake of safety. Cc: Eric Sandeen Acked-by: Jeff Layton Cc: Masayoshi MIZUMA Signed-off-by: Wu Fengguang Cc: Artem Bityutskiy Cc: Christoph Hellwig Acked-by: Jan Kara Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 40308e98c6a..caf049146ca 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -321,7 +321,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) spin_lock(&inode_lock); inode->i_state &= ~I_SYNC; - if (!(inode->i_state & I_FREEING)) { + if (!(inode->i_state & (I_FREEING | I_CLEAR))) { if (!(inode->i_state & I_DIRTY) && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { /* @@ -492,7 +492,7 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } - if (inode->i_state & I_NEW) { + if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; } @@ -523,7 +523,7 @@ void generic_sync_sb_inodes(struct super_block *sb, if (current_is_pdflush() && !writeback_acquire(bdi)) break; - BUG_ON(inode->i_state & I_FREEING); + BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); -- cgit v1.2.3-70-g09d2 From 02d5341ae53d32681241b27a40397475caef1c83 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Tue, 16 Jun 2009 15:33:35 -0700 Subject: ntfs: use is_power_of_2() function for clarity. Signed-off-by: Robert P. J. Day Cc: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ntfs/inode.c | 3 ++- fs/ntfs/logfile.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index 82c5085559c..9938034762c 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -27,6 +27,7 @@ #include #include #include +#include #include "aops.h" #include "attrib.h" @@ -1570,7 +1571,7 @@ static int ntfs_read_locked_index_inode(struct inode *base_vi, struct inode *vi) ntfs_debug("Index collation rule is 0x%x.", le32_to_cpu(ir->collation_rule)); ni->itype.index.block_size = le32_to_cpu(ir->index_block_size); - if (ni->itype.index.block_size & (ni->itype.index.block_size - 1)) { + if (!is_power_of_2(ni->itype.index.block_size)) { ntfs_error(vi->i_sb, "Index block size (%u) is not a power of " "two.", ni->itype.index.block_size); goto unm_err_out; diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index d7932e95b1f..89b02985c05 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "attrib.h" #include "aops.h" @@ -65,7 +66,7 @@ static bool ntfs_check_restart_page_header(struct inode *vi, logfile_log_page_size < NTFS_BLOCK_SIZE || logfile_system_page_size & (logfile_system_page_size - 1) || - logfile_log_page_size & (logfile_log_page_size - 1)) { + !is_power_of_2(logfile_log_page_size)) { ntfs_error(vi->i_sb, "$LogFile uses unsupported page size."); return false; } -- cgit v1.2.3-70-g09d2 From 4938d7e0233a455f04507bac81d0886c71529537 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Jun 2009 15:33:36 -0700 Subject: poll: avoid extra wakeups in select/poll After introduction of keyed wakeups Davide Libenzi did on epoll, we are able to avoid spurious wakeups in poll()/select() code too. For example, typical use of poll()/select() is to wait for incoming network frames on many sockets. But TX completion for UDP/TCP frames call sock_wfree() which in turn schedules thread. When scheduled, thread does a full scan of all polled fds and can sleep again, because nothing is really available. If number of fds is large, this cause significant load. This patch makes select()/poll() aware of keyed wakeups and useless wakeups are avoided. This reduces number of context switches by about 50% on some setups, and work performed by sofirq handlers. Signed-off-by: Eric Dumazet Acked-by: David S. Miller Acked-by: Andi Kleen Acked-by: Ingo Molnar Acked-by: Davide Libenzi Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/select.c | 40 ++++++++++++++++++++++++++++++++++++---- include/linux/poll.h | 3 +++ 2 files changed, 39 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index 0fe0e1469df..d870237e42c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -168,7 +168,7 @@ static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) return table->entry++; } -static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) { struct poll_wqueues *pwq = wait->private; DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); @@ -194,6 +194,16 @@ static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) return default_wake_function(&dummy_wait, mode, sync, key); } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_table_entry *entry; + + entry = container_of(wait, struct poll_table_entry, wait); + if (key && !((unsigned long)key & entry->key)) + return 0; + return __pollwake(wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) @@ -205,6 +215,7 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, get_file(filp); entry->filp = filp; entry->wait_address = wait_address; + entry->key = p->key; init_waitqueue_func_entry(&entry->wait, pollwake); entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); @@ -362,6 +373,18 @@ get_max: #define POLLOUT_SET (POLLWRBAND | POLLWRNORM | POLLOUT | POLLERR) #define POLLEX_SET (POLLPRI) +static inline void wait_key_set(poll_table *wait, unsigned long in, + unsigned long out, unsigned long bit) +{ + if (wait) { + wait->key = POLLEX_SET; + if (in & bit) + wait->key |= POLLIN_SET; + if (out & bit) + wait->key |= POLLOUT_SET; + } +} + int do_select(int n, fd_set_bits *fds, struct timespec *end_time) { ktime_t expire, *to = NULL; @@ -418,20 +441,25 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) if (file) { f_op = file->f_op; mask = DEFAULT_POLLMASK; - if (f_op && f_op->poll) - mask = (*f_op->poll)(file, retval ? NULL : wait); + if (f_op && f_op->poll) { + wait_key_set(wait, in, out, bit); + mask = (*f_op->poll)(file, wait); + } fput_light(file, fput_needed); if ((mask & POLLIN_SET) && (in & bit)) { res_in |= bit; retval++; + wait = NULL; } if ((mask & POLLOUT_SET) && (out & bit)) { res_out |= bit; retval++; + wait = NULL; } if ((mask & POLLEX_SET) && (ex & bit)) { res_ex |= bit; retval++; + wait = NULL; } } } @@ -685,8 +713,12 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait) mask = POLLNVAL; if (file != NULL) { mask = DEFAULT_POLLMASK; - if (file->f_op && file->f_op->poll) + if (file->f_op && file->f_op->poll) { + if (pwait) + pwait->key = pollfd->events | + POLLERR | POLLHUP; mask = file->f_op->poll(file, pwait); + } /* Mask out unneeded events. */ mask &= pollfd->events | POLLERR | POLLHUP; fput_light(file, fput_needed); diff --git a/include/linux/poll.h b/include/linux/poll.h index 8c24ef8d997..fa287f25138 100644 --- a/include/linux/poll.h +++ b/include/linux/poll.h @@ -32,6 +32,7 @@ typedef void (*poll_queue_proc)(struct file *, wait_queue_head_t *, struct poll_ typedef struct poll_table_struct { poll_queue_proc qproc; + unsigned long key; } poll_table; static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_address, poll_table *p) @@ -43,10 +44,12 @@ static inline void poll_wait(struct file * filp, wait_queue_head_t * wait_addres static inline void init_poll_funcptr(poll_table *pt, poll_queue_proc qproc) { pt->qproc = qproc; + pt->key = ~0UL; /* all events enabled */ } struct poll_table_entry { struct file *filp; + unsigned long key; wait_queue_t wait; wait_queue_head_t *wait_address; }; -- cgit v1.2.3-70-g09d2 From 8b0b1db0133e4218a9b45c09e53793c039edebe1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 16 Jun 2009 15:33:39 -0700 Subject: remove put_cpu_no_resched() put_cpu_no_resched() is an optimization of put_cpu() which unfortunately can cause high latencies. The nfs iostats code uses put_cpu_no_resched() in a code sequence where a reschedule request caused by an interrupt between the get_cpu() and the put_cpu_no_resched() can delay the reschedule for at least HZ. The other users of put_cpu_no_resched() optimize correctly in interrupt code, but there is no real harm in using the put_cpu() function which is an alias for preempt_enable(). The extra check of the preemmpt count is not as critical as the potential source of missing a reschedule. Debugged in the preempt-rt tree and verified in mainline. Impact: remove a high latency source [akpm@linux-foundation.org: build fix] Signed-off-by: Thomas Gleixner Acked-by: Ingo Molnar Cc: Tony Luck Cc: Trond Myklebust Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/kernel/perfmon.c | 2 +- fs/nfs/iostat.h | 6 +++--- include/linux/smp.h | 1 - 3 files changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 8a06dc48059..bdc176cb5e8 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -5595,7 +5595,7 @@ pfm_interrupt_handler(int irq, void *arg) (*pfm_alt_intr_handler->handler)(irq, arg, regs); } - put_cpu_no_resched(); + put_cpu(); return IRQ_HANDLED; } diff --git a/fs/nfs/iostat.h b/fs/nfs/iostat.h index a2ab2529b5c..ceda50aad73 100644 --- a/fs/nfs/iostat.h +++ b/fs/nfs/iostat.h @@ -31,7 +31,7 @@ static inline void nfs_inc_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->events[stat]++; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_inc_stats(const struct inode *inode, @@ -50,7 +50,7 @@ static inline void nfs_add_server_stats(const struct nfs_server *server, cpu = get_cpu(); iostats = per_cpu_ptr(server->io_stats, cpu); iostats->bytes[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } static inline void nfs_add_stats(const struct inode *inode, @@ -71,7 +71,7 @@ static inline void nfs_add_fscache_stats(struct inode *inode, cpu = get_cpu(); iostats = per_cpu_ptr(NFS_SERVER(inode)->io_stats, cpu); iostats->fscache[stat] += addend; - put_cpu_no_resched(); + put_cpu(); } #endif diff --git a/include/linux/smp.h b/include/linux/smp.h index a69db820eed..9e3d8af0920 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -177,7 +177,6 @@ static inline void init_call_single_data(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() -#define put_cpu_no_resched() preempt_enable_no_resched() /* * Callback to arch code if there's nosmp or maxcpus=0 on the -- cgit v1.2.3-70-g09d2 From 69050eee8e08a6234f29fe71a56f8c7c7d4d7186 Mon Sep 17 00:00:00 2001 From: Tomas Szepe Date: Tue, 16 Jun 2009 15:33:56 -0700 Subject: CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK CONFIG_FILE_LOCKING should not depend on CONFIG_BLOCK. This makes it possible to run complete systems out of a CONFIG_BLOCK=n initramfs on current kernels again (this last worked on 2.6.27.*). Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 525da2e8f73..4044f163035 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -39,6 +39,13 @@ config FS_POSIX_ACL bool default n +source "fs/xfs/Kconfig" +source "fs/gfs2/Kconfig" +source "fs/ocfs2/Kconfig" +source "fs/btrfs/Kconfig" + +endif # BLOCK + config FILE_LOCKING bool "Enable POSIX file locking API" if EMBEDDED default y @@ -47,13 +54,6 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. -source "fs/xfs/Kconfig" -source "fs/gfs2/Kconfig" -source "fs/ocfs2/Kconfig" -source "fs/btrfs/Kconfig" - -endif # BLOCK - source "fs/notify/Kconfig" source "fs/quota/Kconfig" -- cgit v1.2.3-70-g09d2 From 005411c3e9147bc3b78215390e847d688dbbc163 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 16 Jun 2009 21:36:49 +0100 Subject: AFS: Correctly translate auth error aborts and don't failover in such cases Authentication error abort codes should be translated to appropriate Linux error codes, rather than all being translated to EREMOTEIO - which indicates that the server had internal problems. Additionally, a server shouldn't be marked unavailable and the next server tried if an authentication error occurs. This will quickly make all the servers unavailable to the client. Instead the error should be returned straight to the user. Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/misc.c | 16 ++++++++++++++++ fs/afs/vlocation.c | 2 ++ 2 files changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/afs/misc.c b/fs/afs/misc.c index 2d33a5f7d21..0dd4dafee10 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "internal.h" #include "afs_fs.h" @@ -54,6 +55,21 @@ int afs_abort_to_error(u32 abort_code) case 0x2f6df24: return -ENOLCK; case 0x2f6df26: return -ENOTEMPTY; case 0x2f6df78: return -EDQUOT; + + case RXKADINCONSISTENCY: return -EPROTO; + case RXKADPACKETSHORT: return -EPROTO; + case RXKADLEVELFAIL: return -EKEYREJECTED; + case RXKADTICKETLEN: return -EKEYREJECTED; + case RXKADOUTOFSEQUENCE: return -EPROTO; + case RXKADNOAUTH: return -EKEYREJECTED; + case RXKADBADKEY: return -EKEYREJECTED; + case RXKADBADTICKET: return -EKEYREJECTED; + case RXKADUNKNOWNKEY: return -EKEYREJECTED; + case RXKADEXPIRED: return -EKEYEXPIRED; + case RXKADSEALEDINCON: return -EKEYREJECTED; + case RXKADDATALEN: return -EKEYREJECTED; + case RXKADILLEGALLEVEL: return -EKEYREJECTED; + default: return -EREMOTEIO; } } diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index ec2a7431e45..6e689208def 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -65,6 +65,8 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl, goto out; goto rotate; case -ENOMEDIUM: + case -EKEYREJECTED: + case -EKEYEXPIRED: goto out; default: ret = -EIO; -- cgit v1.2.3-70-g09d2 From 9c64daff9d5afb102dfe64a26829e26725538e58 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 8 Jun 2009 15:22:24 -0400 Subject: ext3: avoid unnecessary spinlock in critical POSIX ACL path If a filesystem supports POSIX ACL's, the VFS layer expects the filesystem to do POSIX ACL checks on any files not owned by the caller, and it does this for every single pathname component that it looks up. That obviously can be pretty expensive if the filesystem isn't careful about it, especially with locking. That's doubly sad, since the common case tends to be that there are no ACL's associated with the files in question. ext3 already caches the ACL data so that it doesn't have to look it up over and over again, but it does so by taking the inode->i_lock spinlock on every lookup. Which is a noticeable overhead even if it's a private lock, especially on CPU's where the serialization is expensive (eg Intel Netburst aka 'P4'). For the special case of not actually having any ACL's, all that locking is unnecessary. Even if somebody else were to be changing the ACL's on another CPU, we simply don't care - if we've seen a NULL ACL, we might as well use it. So just load the ACL speculatively without any locking, and if it was NULL, just use it. If it's non-NULL (either because we had a cached entry, or because the cache hasn't been filled in at all), it means that we'll need to get the lock and re-load it properly. This is noticeable even on Nehalem, which does locking quite well (much better than P4). From lmbench: Processor, Processes - times in microseconds - smaller is better -------------------------------------------------------------------- Host OS Mhz null null open slct fork exec sh call I/O stat clos TCP proc proc proc --------- ------------- ---- ---- ---- ---- ---- ---- ---- ---- ---- - before: nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.95 1.45 2.18 69.1 273. 1141 nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.95 1.48 2.28 69.9 253. 1140 nehalem.l Linux 2.6.30- 3193 0.04 0.10 0.95 1.42 2.19 68.6 284. 1141 - after: nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.44 2.12 68.3 282. 1094 nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.39 2.20 67.0 308. 1123 nehalem.l Linux 2.6.30- 3193 0.04 0.09 0.92 1.39 2.36 67.4 293. 1148 where you can see what appears to be a roughly 3% improvement in stat and open/close latencies from just the removal of the locking overhead. Of course, this only matters for files you don't own (the owner never needs to do the ACL checks), but that's the common case for libraries, header files, and executables. As well as for the base components of any absolute pathname, even if you are the owner of the final file. [ At some point we probably want to move this ACL caching logic entirely into the VFS layer (and only call down to the filesystem when uncached), but in the meantime this improves ext3 a bit. A similar fix to btrfs makes a much bigger difference (15x improvement in lmbench) due to broken caching. ] Signed-off-by: Linus Torvalds Signed-off-by: "Theodore Ts'o" Acked-by: Jan Kara Cc: Al Viro Signed-off-by: Al Viro --- fs/ext3/acl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/acl.c b/fs/ext3/acl.c index d81ef2fdb08..e0c74545171 100644 --- a/fs/ext3/acl.c +++ b/fs/ext3/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext3_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT3_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT3_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT3_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } -- cgit v1.2.3-70-g09d2 From 210ad6aedb332e73167ece5af9bd47f0da8c2aca Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 8 Jun 2009 15:22:25 -0400 Subject: ext4: avoid unnecessary spinlock in critical POSIX ACL path If a filesystem supports POSIX ACL's, the VFS layer expects the filesystem to do POSIX ACL checks on any files not owned by the caller, and it does this for every single pathname component that it looks up. That obviously can be pretty expensive if the filesystem isn't careful about it, especially with locking. That's doubly sad, since the common case tends to be that there are no ACL's associated with the files in question. ext4 already caches the ACL data so that it doesn't have to look it up over and over again, but it does so by taking the inode->i_lock spinlock on every lookup. Which is a noticeable overhead even if it's a private lock, especially on CPU's where the serialization is expensive (eg Intel Netburst aka 'P4'). For the special case of not actually having any ACL's, all that locking is unnecessary. Even if somebody else were to be changing the ACL's on another CPU, we simply don't care - if we've seen a NULL ACL, we might as well use it. So just load the ACL speculatively without any locking, and if it was NULL, just use it. If it's non-NULL (either because we had a cached entry, or because the cache hasn't been filled in at all), it means that we'll need to get the lock and re-load it properly. (This commit was ported from a patch originally authored by Linus for ext3.) Signed-off-by: "Theodore Ts'o" Signed-off-by: Al Viro --- fs/ext4/acl.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 647e0d65a28..605aeed96d6 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -129,12 +129,15 @@ fail: static inline struct posix_acl * ext4_iget_acl(struct inode *inode, struct posix_acl **i_acl) { - struct posix_acl *acl = EXT4_ACL_NOT_CACHED; + struct posix_acl *acl = ACCESS_ONCE(*i_acl); - spin_lock(&inode->i_lock); - if (*i_acl != EXT4_ACL_NOT_CACHED) - acl = posix_acl_dup(*i_acl); - spin_unlock(&inode->i_lock); + if (acl) { + spin_lock(&inode->i_lock); + acl = *i_acl; + if (acl != EXT4_ACL_NOT_CACHED) + acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); + } return acl; } -- cgit v1.2.3-70-g09d2 From b0895513f499b8f786d292ce48589ca210ca1d6e Mon Sep 17 00:00:00 2001 From: "J. R. Okajima" Date: Wed, 17 Jun 2009 01:16:50 +0900 Subject: remove unlock_kernel() left accidentally commit 337eb00a2c3a421999c39c94ce7e33545ee8baa7 Push BKL down into ->remount_fs() and commit 4aa98cf768b6f2ea4b204620d949a665959214f6 Push BKL down into do_remount_sb() were uncorrectly merged. The former removes one pair of lock/unlock_kernel(), but the latter adds several unlock_kernel(). Finally a few unlock_kernel() calls left. Signed-off-by: J. R. Okajima Signed-off-by: Al Viro --- fs/super.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 83b47416d00..d40d53a22fb 100644 --- a/fs/super.c +++ b/fs/super.c @@ -545,24 +545,18 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) if ((flags & MS_RDONLY) && !(sb->s_flags & MS_RDONLY)) { if (force) mark_files_ro(sb); - else if (!fs_may_remount_ro(sb)) { - unlock_kernel(); + else if (!fs_may_remount_ro(sb)) return -EBUSY; - } retval = vfs_dq_off(sb, 1); - if (retval < 0 && retval != -ENOSYS) { - unlock_kernel(); + if (retval < 0 && retval != -ENOSYS) return -EBUSY; - } } remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { retval = sb->s_op->remount_fs(sb, &flags, data); - if (retval) { - unlock_kernel(); + if (retval) return retval; - } } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); if (remount_rw) -- cgit v1.2.3-70-g09d2 From fe36adf47eb1f7f4972559efa30ce3d2d3f977f2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 13:35:01 -0400 Subject: No instance of ->bmap() needs BKL Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 +- fs/ioctl.c | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 3120f8dd2c3..229d7b7c50a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -187,7 +187,7 @@ readpages: no write_begin: no locks the page yes write_end: no yes, unlocks yes perform_write: no n/a yes -bmap: yes +bmap: no invalidatepage: no yes releasepage: no yes direct_IO: no diff --git a/fs/ioctl.c b/fs/ioctl.c index 286f38dfc6c..001f8d3118f 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -70,9 +70,7 @@ static int ioctl_fibmap(struct file *filp, int __user *p) res = get_user(block, p); if (res) return res; - lock_kernel(); res = mapping->a_ops->bmap(mapping, block); - unlock_kernel(); return put_user(res, p); } -- cgit v1.2.3-70-g09d2 From 66c6af2e8ba55d4d6691c136b42f2423ab9598ec Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 14:15:00 -0400 Subject: fuse doesn't need BKL in ->umount_begin() Signed-off-by: Al Viro --- fs/fuse/inode.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index f0df55a5292..d8673ccf90b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -19,7 +19,6 @@ #include #include #include -#include MODULE_AUTHOR("Miklos Szeredi "); MODULE_DESCRIPTION("Filesystem in Userspace"); @@ -260,9 +259,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, static void fuse_umount_begin(struct super_block *sb) { - lock_kernel(); fuse_abort_conn(get_fuse_conn_super(sb)); - unlock_kernel(); } static void fuse_send_destroy(struct fuse_conn *fc) -- cgit v1.2.3-70-g09d2 From ee450f796f6c4f3a563c914cb93ccfa91a1f7580 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 14:17:21 -0400 Subject: 9P doesn't need BKL in ->umount_begin() Signed-off-by: Al Viro --- fs/9p/vfs_super.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index ab5547ff29a..38d695d66a0 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -37,7 +37,6 @@ #include #include #include -#include #include #include @@ -231,10 +230,8 @@ v9fs_umount_begin(struct super_block *sb) { struct v9fs_session_info *v9ses; - lock_kernel(); v9ses = sb->s_fs_info; v9fs_session_cancel(v9ses); - unlock_kernel(); } static const struct super_operations v9fs_super_ops = { -- cgit v1.2.3-70-g09d2 From 608ba50bd0225d95469154feba8f00a6457848c1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 14:52:13 -0400 Subject: Cleanup of adfs headers Signed-off-by: Al Viro --- fs/adfs/adfs.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++ fs/adfs/dir.c | 8 ------- fs/adfs/dir_f.c | 8 ------- fs/adfs/dir_fplus.c | 8 ------- fs/adfs/file.c | 4 ---- fs/adfs/inode.c | 10 --------- fs/adfs/map.c | 6 ----- fs/adfs/super.c | 17 ++------------ include/linux/adfs_fs.h | 13 ----------- include/linux/adfs_fs_i.h | 24 -------------------- include/linux/adfs_fs_sb.h | 38 -------------------------------- 11 files changed, 57 insertions(+), 134 deletions(-) delete mode 100644 include/linux/adfs_fs_i.h delete mode 100644 include/linux/adfs_fs_sb.h (limited to 'fs') diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index a6665f37f45..9cc18775b83 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -1,3 +1,6 @@ +#include +#include + /* Internal data structures for ADFS */ #define ADFS_FREE_FRAG 0 @@ -16,6 +19,58 @@ struct buffer_head; +/* + * adfs file system inode data in memory + */ +struct adfs_inode_info { + loff_t mmu_private; + unsigned long parent_id; /* object id of parent */ + __u32 loadaddr; /* RISC OS load address */ + __u32 execaddr; /* RISC OS exec address */ + unsigned int filetype; /* RISC OS file type */ + unsigned int attr; /* RISC OS permissions */ + unsigned int stamped:1; /* RISC OS file has date/time */ + struct inode vfs_inode; +}; + +/* + * Forward-declare this + */ +struct adfs_discmap; +struct adfs_dir_ops; + +/* + * ADFS file system superblock data in memory + */ +struct adfs_sb_info { + struct adfs_discmap *s_map; /* bh list containing map */ + struct adfs_dir_ops *s_dir; /* directory operations */ + + uid_t s_uid; /* owner uid */ + gid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ + + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector */ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ + unsigned int s_namelen; /* maximum number of characters in name */ +}; + +static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline struct adfs_inode_info *ADFS_I(struct inode *inode) +{ + return container_of(inode, struct adfs_inode_info, vfs_inode); +} + /* * Directory handling */ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 4d4073447d1..23aa52f548a 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -9,15 +9,7 @@ * * Common directory handling for ADFS */ -#include -#include -#include -#include -#include -#include #include -#include /* for file_fsync() */ - #include "adfs.h" /* diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 31df6adf0de..bafc71222e2 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -9,15 +9,7 @@ * * E and F format directory handling */ -#include -#include -#include -#include -#include -#include #include -#include - #include "adfs.h" #include "dir_f.h" diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 139e0f345f1..1796bb352d0 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -7,15 +7,7 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include #include -#include - #include "adfs.h" #include "dir_fplus.h" diff --git a/fs/adfs/file.c b/fs/adfs/file.c index 8224d54a2af..005ea34d175 100644 --- a/fs/adfs/file.c +++ b/fs/adfs/file.c @@ -19,10 +19,6 @@ * * adfs regular file handling primitives */ -#include -#include /* for file_fsync() */ -#include - #include "adfs.h" const struct file_operations adfs_file_operations = { diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c index 05b3a677201..798cb071d13 100644 --- a/fs/adfs/inode.c +++ b/fs/adfs/inode.c @@ -7,17 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include -#include -#include -#include #include -#include #include - #include "adfs.h" /* @@ -395,4 +386,3 @@ int adfs_write_inode(struct inode *inode, int wait) unlock_kernel(); return ret; } -MODULE_LICENSE("GPL"); diff --git a/fs/adfs/map.c b/fs/adfs/map.c index 568081b93f7..d1a5932bb0f 100644 --- a/fs/adfs/map.c +++ b/fs/adfs/map.c @@ -7,14 +7,8 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include -#include -#include -#include #include - #include - #include "adfs.h" /* diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 0ec5aaf47aa..aad92f0a104 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -8,26 +8,12 @@ * published by the Free Software Foundation. */ #include -#include -#include -#include -#include -#include -#include -#include #include #include -#include #include -#include #include #include - -#include -#include - -#include - +#include #include "adfs.h" #include "dir_f.h" #include "dir_fplus.h" @@ -534,3 +520,4 @@ static void __exit exit_adfs_fs(void) module_init(init_adfs_fs) module_exit(exit_adfs_fs) +MODULE_LICENSE("GPL"); diff --git a/include/linux/adfs_fs.h b/include/linux/adfs_fs.h index ef788c2085a..b19801f7389 100644 --- a/include/linux/adfs_fs.h +++ b/include/linux/adfs_fs.h @@ -41,8 +41,6 @@ struct adfs_discrecord { #define ADFS_DR_SIZE_BITS (ADFS_DR_SIZE << 3) #ifdef __KERNEL__ -#include -#include /* * Calculate the boot block checksum on an ADFS drive. Note that this will * appear to be correct if the sector contains all zeros, so also check that @@ -60,17 +58,6 @@ static inline int adfs_checkbblk(unsigned char *ptr) return (result & 0xff) != ptr[511]; } - -static inline struct adfs_sb_info *ADFS_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct adfs_inode_info *ADFS_I(struct inode *inode) -{ - return container_of(inode, struct adfs_inode_info, vfs_inode); -} - #endif #endif diff --git a/include/linux/adfs_fs_i.h b/include/linux/adfs_fs_i.h deleted file mode 100644 index cb543034e54..00000000000 --- a/include/linux/adfs_fs_i.h +++ /dev/null @@ -1,24 +0,0 @@ -/* - * linux/include/linux/adfs_fs_i.h - * - * Copyright (C) 1997 Russell King - */ - -#ifndef _ADFS_FS_I -#define _ADFS_FS_I - -/* - * adfs file system inode data in memory - */ -struct adfs_inode_info { - loff_t mmu_private; - unsigned long parent_id; /* object id of parent */ - __u32 loadaddr; /* RISC OS load address */ - __u32 execaddr; /* RISC OS exec address */ - unsigned int filetype; /* RISC OS file type */ - unsigned int attr; /* RISC OS permissions */ - unsigned int stamped:1; /* RISC OS file has date/time */ - struct inode vfs_inode; -}; - -#endif diff --git a/include/linux/adfs_fs_sb.h b/include/linux/adfs_fs_sb.h deleted file mode 100644 index d9bf05c02cc..00000000000 --- a/include/linux/adfs_fs_sb.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * linux/include/linux/adfs_fs_sb.h - * - * Copyright (C) 1997-1999 Russell King - */ - -#ifndef _ADFS_FS_SB -#define _ADFS_FS_SB - -/* - * Forward-declare this - */ -struct adfs_discmap; -struct adfs_dir_ops; - -/* - * ADFS file system superblock data in memory - */ -struct adfs_sb_info { - struct adfs_discmap *s_map; /* bh list containing map */ - struct adfs_dir_ops *s_dir; /* directory operations */ - - uid_t s_uid; /* owner uid */ - gid_t s_gid; /* owner gid */ - umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ - umode_t s_other_mask; /* ADFS other perm -> unix perm */ - - __u32 s_ids_per_zone; /* max. no ids in one zone */ - __u32 s_idlen; /* length of ID in map */ - __u32 s_map_size; /* sector size of a map */ - unsigned long s_size; /* total size (in blocks) of this fs */ - signed int s_map2blk; /* shift left by this for map->sector */ - unsigned int s_log2sharesize;/* log2 share size */ - __le32 s_version; /* disc format version */ - unsigned int s_namelen; /* maximum number of characters in name */ -}; - -#endif -- cgit v1.2.3-70-g09d2 From 536c94901eb8f2eb6fccf81ae6be814899a9f6e8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 23:24:50 -0400 Subject: befs ->pust_super() doesn't need BKL Signed-off-by: Al Viro --- fs/befs/linuxvfs.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 9367b6297d8..02c06138bc6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -737,8 +737,6 @@ parse_options(char *options, befs_mount_options * opts) static void befs_put_super(struct super_block *sb) { - lock_kernel(); - kfree(BEFS_SB(sb)->mount_opts.iocharset); BEFS_SB(sb)->mount_opts.iocharset = NULL; @@ -749,8 +747,6 @@ befs_put_super(struct super_block *sb) kfree(sb->s_fs_info); sb->s_fs_info = NULL; - - unlock_kernel(); } /* Allocate private field of the superblock, fill it. -- cgit v1.2.3-70-g09d2 From e7ec952f6aa6ac1649ac49eb5e4de5b92c829d1e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 23:35:46 -0400 Subject: get rid of BKL in fs/efs Only readdir() really needed it, and that's easily fixable by switch to generic_file_llseek() Signed-off-by: Al Viro --- fs/efs/dir.c | 5 +---- fs/efs/namei.c | 9 +-------- fs/efs/symlink.c | 7 +------ 3 files changed, 3 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 49308a29798..7ee6f7e3a60 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -5,12 +5,12 @@ */ #include -#include #include "efs.h" static int efs_readdir(struct file *, void *, filldir_t); const struct file_operations efs_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = efs_readdir, }; @@ -33,8 +33,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { if (inode->i_size & (EFS_DIRBSIZE-1)) printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); - lock_kernel(); - /* work out where this entry can be found */ block = filp->f_pos >> EFS_DIRBSIZE_BITS; @@ -107,7 +105,6 @@ static int efs_readdir(struct file *filp, void *dirent, filldir_t filldir) { filp->f_pos = (block << EFS_DIRBSIZE_BITS) | slot; out: - unlock_kernel(); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index c3fb5f9c4a4..1511bf9e5f8 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -8,7 +8,6 @@ #include #include -#include #include #include "efs.h" @@ -63,16 +62,12 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei efs_ino_t inodenum; struct inode * inode = NULL; - lock_kernel(); inodenum = efs_find_entry(dir, dentry->d_name.name, dentry->d_name.len); if (inodenum) { inode = efs_iget(dir->i_sb, inodenum); - if (IS_ERR(inode)) { - unlock_kernel(); + if (IS_ERR(inode)) return ERR_CAST(inode); - } } - unlock_kernel(); return d_splice_alias(inode, dentry); } @@ -115,11 +110,9 @@ struct dentry *efs_get_parent(struct dentry *child) struct dentry *parent = ERR_PTR(-ENOENT); efs_ino_t ino; - lock_kernel(); ino = efs_find_entry(child->d_inode, "..", 2); if (ino) parent = d_obtain_alias(efs_iget(child->d_inode->i_sb, ino)); - unlock_kernel(); return parent; } diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 41911ec83aa..75117d0dac2 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -9,7 +9,6 @@ #include #include #include -#include #include "efs.h" static int efs_symlink_readpage(struct file *file, struct page *page) @@ -22,9 +21,8 @@ static int efs_symlink_readpage(struct file *file, struct page *page) err = -ENAMETOOLONG; if (size > 2 * EFS_BLOCKSIZE) - goto fail_notlocked; + goto fail; - lock_kernel(); /* read first 512 bytes of link target */ err = -EIO; bh = sb_bread(inode->i_sb, efs_bmap(inode, 0)); @@ -40,14 +38,11 @@ static int efs_symlink_readpage(struct file *file, struct page *page) brelse(bh); } link[size] = '\0'; - unlock_kernel(); SetPageUptodate(page); kunmap(page); unlock_page(page); return 0; fail: - unlock_kernel(); -fail_notlocked: SetPageError(page); kunmap(page); unlock_page(page); -- cgit v1.2.3-70-g09d2 From cc46759a8c0ac4c6f13aa4b0f470305c05f600e1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 23:47:45 -0400 Subject: get rid of BKL in fs/minix Signed-off-by: Al Viro --- fs/minix/bitmap.c | 25 +++++++++++++------------ fs/minix/dir.c | 5 +---- fs/minix/inode.c | 4 ---- 3 files changed, 14 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/minix/bitmap.c b/fs/minix/bitmap.c index 3aebe322271..6ac693faae4 100644 --- a/fs/minix/bitmap.c +++ b/fs/minix/bitmap.c @@ -12,13 +12,14 @@ /* bitmap.c contains the code that handles the inode and block bitmaps */ #include "minix.h" -#include #include #include #include static const int nibblemap[] = { 4,3,3,2,3,2,2,1,3,2,2,1,2,1,1,0 }; +static DEFINE_SPINLOCK(bitmap_lock); + static unsigned long count_free(struct buffer_head *map[], unsigned numblocks, __u32 numbits) { unsigned i, j, sum = 0; @@ -69,11 +70,11 @@ void minix_free_block(struct inode *inode, unsigned long block) return; } bh = sbi->s_zmap[zone]; - lock_kernel(); + spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_block (%s:%lu): bit already cleared\n", sb->s_id, block); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); return; } @@ -88,18 +89,18 @@ int minix_new_block(struct inode * inode) struct buffer_head *bh = sbi->s_zmap[i]; int j; - lock_kernel(); + spin_lock(&bitmap_lock); j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); if (j < bits_per_zone) { minix_set_bit(j, bh->b_data); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone + sbi->s_firstdatazone-1; if (j < sbi->s_firstdatazone || j >= sbi->s_nzones) break; return j; } - unlock_kernel(); + spin_unlock(&bitmap_lock); } return 0; } @@ -211,10 +212,10 @@ void minix_free_inode(struct inode * inode) minix_clear_inode(inode); /* clear on-disk copy */ bh = sbi->s_imap[ino]; - lock_kernel(); + spin_lock(&bitmap_lock); if (!minix_test_and_clear_bit(bit, bh->b_data)) printk("minix_free_inode: bit %lu already cleared\n", bit); - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); out: clear_inode(inode); /* clear in-memory copy */ @@ -237,7 +238,7 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) j = bits_per_zone; bh = NULL; *error = -ENOSPC; - lock_kernel(); + spin_lock(&bitmap_lock); for (i = 0; i < sbi->s_imap_blocks; i++) { bh = sbi->s_imap[i]; j = minix_find_first_zero_bit(bh->b_data, bits_per_zone); @@ -245,17 +246,17 @@ struct inode * minix_new_inode(const struct inode * dir, int * error) break; } if (!bh || j >= bits_per_zone) { - unlock_kernel(); + spin_unlock(&bitmap_lock); iput(inode); return NULL; } if (minix_test_and_set_bit(j, bh->b_data)) { /* shouldn't happen */ - unlock_kernel(); + spin_unlock(&bitmap_lock); printk("minix_new_inode: bit already set\n"); iput(inode); return NULL; } - unlock_kernel(); + spin_unlock(&bitmap_lock); mark_buffer_dirty(bh); j += i * bits_per_zone; if (!j || j > sbi->s_ninodes) { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index e5f206467e4..d407e7a0b6f 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -11,7 +11,6 @@ #include "minix.h" #include #include -#include #include typedef struct minix_dir_entry minix_dirent; @@ -20,6 +19,7 @@ typedef struct minix3_dir_entry minix3_dirent; static int minix_readdir(struct file *, void *, filldir_t); const struct file_operations minix_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = minix_readdir, .fsync = simple_fsync, @@ -102,8 +102,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) char *name; __u32 inumber; - lock_kernel(); - pos = (pos + chunk_size-1) & ~(chunk_size-1); if (pos >= inode->i_size) goto done; @@ -146,7 +144,6 @@ static int minix_readdir(struct file * filp, void * dirent, filldir_t filldir) done: filp->f_pos = (n << PAGE_CACHE_SHIFT) | offset; - unlock_kernel(); return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f91a2369359..74ea82d7216 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -35,8 +35,6 @@ static void minix_put_super(struct super_block *sb) int i; struct minix_sb_info *sbi = minix_sb(sb); - lock_kernel(); - if (!(sb->s_flags & MS_RDONLY)) { if (sbi->s_version != MINIX_V3) /* s_state is now out from V3 sb */ sbi->s_ms->s_state = sbi->s_mount_state; @@ -50,8 +48,6 @@ static void minix_put_super(struct super_block *sb) kfree(sbi->s_imap); sb->s_fs_info = NULL; kfree(sbi); - - unlock_kernel(); } static struct kmem_cache * minix_inode_cachep; -- cgit v1.2.3-70-g09d2 From 5ac3455a843d2ca77333c954eea83aa4514c8199 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2009 23:59:37 -0400 Subject: get rid of BKL in fs/sysv Signed-off-by: Al Viro --- fs/sysv/dir.c | 5 +---- fs/sysv/inode.c | 11 ----------- 2 files changed, 1 insertion(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index c7798079e64..4e50286a4cc 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -15,13 +15,13 @@ #include #include -#include #include #include "sysv.h" static int sysv_readdir(struct file *, void *, filldir_t); const struct file_operations sysv_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = sysv_readdir, .fsync = simple_fsync, @@ -74,8 +74,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) unsigned long n = pos >> PAGE_CACHE_SHIFT; unsigned long npages = dir_pages(inode); - lock_kernel(); - pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); if (pos >= inode->i_size) goto done; @@ -113,7 +111,6 @@ static int sysv_readdir(struct file * filp, void * dirent, filldir_t filldir) done: filp->f_pos = ((loff_t)n << PAGE_CACHE_SHIFT) | offset; - unlock_kernel(); return 0; } diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 479923456a5..9824743832a 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -21,7 +21,6 @@ * the superblock. */ -#include #include #include #include @@ -37,7 +36,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait) unsigned long time = get_seconds(), old_time; lock_super(sb); - lock_kernel(); /* * If we are going to write out the super block, @@ -52,7 +50,6 @@ static int sysv_sync_fs(struct super_block *sb, int wait) mark_buffer_dirty(sbi->s_bh2); } - unlock_kernel(); unlock_super(sb); return 0; @@ -82,8 +79,6 @@ static void sysv_put_super(struct super_block *sb) { struct sysv_sb_info *sbi = SYSV_SB(sb); - lock_kernel(); - if (sb->s_dirt) sysv_write_super(sb); @@ -99,8 +94,6 @@ static void sysv_put_super(struct super_block *sb) brelse(sbi->s_bh2); kfree(sbi); - - unlock_kernel(); } static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) @@ -275,7 +268,6 @@ int sysv_write_inode(struct inode *inode, int wait) return -EIO; } - lock_kernel(); raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(inode->i_uid)); raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(inode->i_gid)); @@ -291,7 +283,6 @@ int sysv_write_inode(struct inode *inode, int wait) for (block = 0; block < 10+1+1+1; block++) write3byte(sbi, (u8 *)&si->i_data[block], &raw_inode->i_data[3*block]); - unlock_kernel(); mark_buffer_dirty(bh); if (wait) { sync_dirty_buffer(bh); @@ -315,9 +306,7 @@ static void sysv_delete_inode(struct inode *inode) truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; sysv_truncate(inode); - lock_kernel(); sysv_free_inode(inode); - unlock_kernel(); } static struct kmem_cache *sysv_inode_cachep; -- cgit v1.2.3-70-g09d2 From 852969b2d273e77dabbc22e1c1058cbafb7ad7d2 Mon Sep 17 00:00:00 2001 From: David Daney Date: Wed, 27 May 2009 17:47:45 -0700 Subject: Hugetlbfs: Enable hugetlbfs for more systems in Kconfig. As part of adding hugetlbfs support for MIPS, I am adding a new kconfig variable 'SYS_SUPPORTS_HUGETLBFS'. Since some mips cpu varients don't yet support it, we can enable selection of HUGETLBFS on a system by system basis from the arch/mips/Kconfig. Signed-off-by: David Daney CC: William Irwin Signed-off-by: Ralf Baechle --- fs/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index 4044f163035..d78e950402c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -134,7 +134,7 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ - (S390 && 64BIT) || BROKEN + (S390 && 64BIT) || SYS_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read -- cgit v1.2.3-70-g09d2