From d7eecb483cc29e929bbc5515b8def830d7fc6ad2 Mon Sep 17 00:00:00 2001 From: Daniel Mack Date: Thu, 28 Jan 2010 16:13:01 +0800 Subject: jfs_dmap.[ch]: trivial typo fix: s/heigth/height/g MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Daniel Mack Signed-off-by: Dave Kleikamp Cc: Jiri Kosina Cc: André Goddard Rosa Cc: jfs-discussion@lists.sourceforge.net --- fs/jfs/jfs_dmap.c | 16 ++++++++-------- fs/jfs/jfs_dmap.h | 6 +++--- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index d9b031cf69f..7e19d2fe009 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -195,7 +195,7 @@ int dbMount(struct inode *ipbmap) bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); - bmp->db_agheigth = le32_to_cpu(dbmp_le->dn_agheigth); + bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); @@ -287,7 +287,7 @@ int dbSync(struct inode *ipbmap) dbmp_le->dn_maxag = cpu_to_le32(bmp->db_maxag); dbmp_le->dn_agpref = cpu_to_le32(bmp->db_agpref); dbmp_le->dn_aglevel = cpu_to_le32(bmp->db_aglevel); - dbmp_le->dn_agheigth = cpu_to_le32(bmp->db_agheigth); + dbmp_le->dn_agheight = cpu_to_le32(bmp->db_agheight); dbmp_le->dn_agwidth = cpu_to_le32(bmp->db_agwidth); dbmp_le->dn_agstart = cpu_to_le32(bmp->db_agstart); dbmp_le->dn_agl2size = cpu_to_le32(bmp->db_agl2size); @@ -1440,7 +1440,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * tree index of this allocation group within the control page. */ agperlev = - (1 << (L2LPERCTL - (bmp->db_agheigth << 1))) / bmp->db_agwidth; + (1 << (L2LPERCTL - (bmp->db_agheight << 1))) / bmp->db_agwidth; ti = bmp->db_agstart + bmp->db_agwidth * (agno & (agperlev - 1)); /* dmap control page trees fan-out by 4 and a single allocation @@ -1459,7 +1459,7 @@ dbAllocAG(struct bmap * bmp, int agno, s64 nblocks, int l2nb, s64 * results) * the subtree to find the leftmost leaf that describes this * free space. */ - for (k = bmp->db_agheigth; k > 0; k--) { + for (k = bmp->db_agheight; k > 0; k--) { for (n = 0, m = (ti << 2) + 1; n < 4; n++) { if (l2nb <= dcp->stree[m + n]) { ti = m + n; @@ -3606,7 +3606,7 @@ void dbFinalizeBmap(struct inode *ipbmap) } /* - * compute db_aglevel, db_agheigth, db_width, db_agstart: + * compute db_aglevel, db_agheight, db_width, db_agstart: * an ag is covered in aglevel dmapctl summary tree, * at agheight level height (from leaf) with agwidth number of nodes * each, which starts at agstart index node of the smmary tree node @@ -3615,9 +3615,9 @@ void dbFinalizeBmap(struct inode *ipbmap) bmp->db_aglevel = BMAPSZTOLEV(bmp->db_agsize); l2nl = bmp->db_agl2size - (L2BPERDMAP + bmp->db_aglevel * L2LPERCTL); - bmp->db_agheigth = l2nl >> 1; - bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheigth << 1)); - for (i = 5 - bmp->db_agheigth, bmp->db_agstart = 0, n = 1; i > 0; + bmp->db_agheight = l2nl >> 1; + bmp->db_agwidth = 1 << (l2nl - (bmp->db_agheight << 1)); + for (i = 5 - bmp->db_agheight, bmp->db_agstart = 0, n = 1; i > 0; i--) { bmp->db_agstart += n; n <<= 2; diff --git a/fs/jfs/jfs_dmap.h b/fs/jfs/jfs_dmap.h index 1a6eb41569b..6dcb906c55d 100644 --- a/fs/jfs/jfs_dmap.h +++ b/fs/jfs/jfs_dmap.h @@ -210,7 +210,7 @@ struct dbmap_disk { __le32 dn_maxag; /* 4: max active alloc group number */ __le32 dn_agpref; /* 4: preferred alloc group (hint) */ __le32 dn_aglevel; /* 4: dmapctl level holding the AG */ - __le32 dn_agheigth; /* 4: height in dmapctl of the AG */ + __le32 dn_agheight; /* 4: height in dmapctl of the AG */ __le32 dn_agwidth; /* 4: width in dmapctl of the AG */ __le32 dn_agstart; /* 4: start tree index at AG height */ __le32 dn_agl2size; /* 4: l2 num of blks per alloc group */ @@ -229,7 +229,7 @@ struct dbmap { int dn_maxag; /* max active alloc group number */ int dn_agpref; /* preferred alloc group (hint) */ int dn_aglevel; /* dmapctl level holding the AG */ - int dn_agheigth; /* height in dmapctl of the AG */ + int dn_agheight; /* height in dmapctl of the AG */ int dn_agwidth; /* width in dmapctl of the AG */ int dn_agstart; /* start tree index at AG height */ int dn_agl2size; /* l2 num of blks per alloc group */ @@ -255,7 +255,7 @@ struct bmap { #define db_agsize db_bmap.dn_agsize #define db_agl2size db_bmap.dn_agl2size #define db_agwidth db_bmap.dn_agwidth -#define db_agheigth db_bmap.dn_agheigth +#define db_agheight db_bmap.dn_agheight #define db_agstart db_bmap.dn_agstart #define db_numag db_bmap.dn_numag #define db_maxlevel db_bmap.dn_maxlevel -- cgit v1.2.3-70-g09d2 From 8bf8c376ab2eefaf0386f4e003e720e1434fa43d Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Wed, 3 Mar 2010 06:28:06 +0300 Subject: blkdev: fix merge_bvec_fn return value checks v2 merge_bvec_fn() returns bvec->bv_len on success. So we have to check against this value. But in case of fs_optimization merge we compare with wrong value. This patch must be included in b428cd6da7e6559aca69aa2e3a526037d3f20403 But accidentally i've forgot to add this in the initial patch. To make things straight let's replace all such checks. In fact this makes code easy to understand. Signed-off-by: Dmitry Monakhov Signed-off-by: Jens Axboe --- fs/bio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index dc17afd672e..d89e0199dc0 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -555,7 +555,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page .bi_rw = bio->bi_rw, }; - if (q->merge_bvec_fn(q, &bvm, prev) < len) { + if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) { prev->bv_len -= len; return 0; } @@ -608,7 +608,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * merge_bvec_fn() returns number of bytes it can accept * at this offset */ - if (q->merge_bvec_fn(q, &bvm, bvec) < len) { + if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) { bvec->bv_page = NULL; bvec->bv_len = 0; bvec->bv_offset = 0; -- cgit v1.2.3-70-g09d2 From f11c9c5c259cb2c3d698548dc3936f773ab1f5b9 Mon Sep 17 00:00:00 2001 From: Edward Shishkin Date: Thu, 11 Mar 2010 14:09:47 -0800 Subject: vfs: improve writeback_inodes_wb() Do not pin/unpin superblock for every inode in writeback_inodes_wb(), pin it for the whole group of inodes which belong to the same superblock and call writeback_sb_inodes() handler for them. Signed-off-by: Edward Shishkin Cc: Jens Axboe Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/fs-writeback.c | 133 +++++++++++++++++++++++++--------------------- include/linux/writeback.h | 3 ++ 2 files changed, 76 insertions(+), 60 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76fc4d594ac..6841effa47c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -553,108 +553,85 @@ select_queue: return ret; } -static void unpin_sb_for_writeback(struct super_block **psb) +static void unpin_sb_for_writeback(struct super_block *sb) { - struct super_block *sb = *psb; - - if (sb) { - up_read(&sb->s_umount); - put_super(sb); - *psb = NULL; - } + up_read(&sb->s_umount); + put_super(sb); } +enum sb_pin_state { + SB_PINNED, + SB_NOT_PINNED, + SB_PIN_FAILED +}; + /* * For WB_SYNC_NONE writeback, the caller does not have the sb pinned * before calling writeback. So make sure that we do pin it, so it doesn't * go away while we are writing inodes from it. - * - * Returns 0 if the super was successfully pinned (or pinning wasn't needed), - * 1 if we failed. */ -static int pin_sb_for_writeback(struct writeback_control *wbc, - struct inode *inode, struct super_block **psb) +static enum sb_pin_state pin_sb_for_writeback(struct writeback_control *wbc, + struct super_block *sb) { - struct super_block *sb = inode->i_sb; - - /* - * If this sb is already pinned, nothing more to do. If not and - * *psb is non-NULL, unpin the old one first - */ - if (sb == *psb) - return 0; - else if (*psb) - unpin_sb_for_writeback(psb); - /* * Caller must already hold the ref for this */ if (wbc->sync_mode == WB_SYNC_ALL) { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - return 0; + return SB_NOT_PINNED; } - spin_lock(&sb_lock); sb->s_count++; if (down_read_trylock(&sb->s_umount)) { if (sb->s_root) { spin_unlock(&sb_lock); - goto pinned; + return SB_PINNED; } /* * umounted, drop rwsem again and fall through to failure */ up_read(&sb->s_umount); } - sb->s_count--; spin_unlock(&sb_lock); - return 1; -pinned: - *psb = sb; - return 0; + return SB_PIN_FAILED; } -static void writeback_inodes_wb(struct bdi_writeback *wb, - struct writeback_control *wbc) +/* + * Write a portion of b_io inodes which belong to @sb. + * If @wbc->sb != NULL, then find and write all such + * inodes. Otherwise write only ones which go sequentially + * in reverse order. + * Return 1, if the caller writeback routine should be + * interrupted. Otherwise return 0. + */ +static int writeback_sb_inodes(struct super_block *sb, + struct bdi_writeback *wb, + struct writeback_control *wbc) { - struct super_block *sb = wbc->sb, *pin_sb = NULL; - const unsigned long start = jiffies; /* livelock avoidance */ - - spin_lock(&inode_lock); - - if (!wbc->for_kupdate || list_empty(&wb->b_io)) - queue_io(wb, wbc->older_than_this); - while (!list_empty(&wb->b_io)) { - struct inode *inode = list_entry(wb->b_io.prev, - struct inode, i_list); long pages_skipped; - - /* - * super block given and doesn't match, skip this inode - */ - if (sb && sb != inode->i_sb) { + struct inode *inode = list_entry(wb->b_io.prev, + struct inode, i_list); + if (wbc->sb && sb != inode->i_sb) { + /* super block given and doesn't + match, skip this inode */ redirty_tail(inode); continue; } - + if (sb != inode->i_sb) + /* finish with this superblock */ + return 0; if (inode->i_state & (I_NEW | I_WILL_FREE)) { requeue_io(inode); continue; } - /* * Was this inode dirtied after sync_sb_inodes was called? * This keeps sync from extra jobs and livelock. */ - if (inode_dirtied_after(inode, start)) - break; - - if (pin_sb_for_writeback(wbc, inode, &pin_sb)) { - requeue_io(inode); - continue; - } + if (inode_dirtied_after(inode, wbc->wb_start)) + return 1; BUG_ON(inode->i_state & (I_FREEING | I_CLEAR)); __iget(inode); @@ -673,14 +650,50 @@ static void writeback_inodes_wb(struct bdi_writeback *wb, spin_lock(&inode_lock); if (wbc->nr_to_write <= 0) { wbc->more_io = 1; - break; + return 1; } if (!list_empty(&wb->b_more_io)) wbc->more_io = 1; } + /* b_io is empty */ + return 1; +} + +static void writeback_inodes_wb(struct bdi_writeback *wb, + struct writeback_control *wbc) +{ + int ret = 0; - unpin_sb_for_writeback(&pin_sb); + wbc->wb_start = jiffies; /* livelock avoidance */ + spin_lock(&inode_lock); + if (!wbc->for_kupdate || list_empty(&wb->b_io)) + queue_io(wb, wbc->older_than_this); + + while (!list_empty(&wb->b_io)) { + struct inode *inode = list_entry(wb->b_io.prev, + struct inode, i_list); + struct super_block *sb = inode->i_sb; + enum sb_pin_state state; + + if (wbc->sb && sb != wbc->sb) { + /* super block given and doesn't + match, skip this inode */ + redirty_tail(inode); + continue; + } + state = pin_sb_for_writeback(wbc, sb); + + if (state == SB_PIN_FAILED) { + requeue_io(inode); + continue; + } + ret = writeback_sb_inodes(sb, wb, wbc); + if (state == SB_PINNED) + unpin_sb_for_writeback(sb); + if (ret) + break; + } spin_unlock(&inode_lock); /* Leave any unwritten inodes on b_io */ } diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 76e8903cd20..36520ded3e0 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -34,6 +34,9 @@ struct writeback_control { enum writeback_sync_modes sync_mode; unsigned long *older_than_this; /* If !NULL, only write back inodes older than this */ + unsigned long wb_start; /* Time writeback_inodes_wb was + called. This is needed to avoid + extra jobs and livelock */ long nr_to_write; /* Write this many pages, and decrement this for each page written */ long pages_skipped; /* Pages which were not written */ -- cgit v1.2.3-70-g09d2 From 157f1071354db1aed885816094888e0e257c9d0a Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 11 Feb 2010 07:10:38 -0600 Subject: eCryptfs: Fix metadata in xattr feature regression Fixes regression in 8faece5f906725c10e7a1f6caf84452abadbdc7b When using the ecryptfs_xattr_metadata mount option, eCryptfs stores the metadata (normally stored at the front of the file) in the user.ecryptfs xattr. This causes ecryptfs_crypt_stat.num_header_bytes_at_front to be 0, since there is no header data at the front of the file. This results in too much memory being requested and ENOMEM being returned from ecryptfs_write_metadata(). This patch fixes the problem by using the num_header_bytes_at_front variable for specifying the max size of the metadata, despite whether it is stored in the header or xattr. Reviewed-by: Eric Sandeen Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 7 ++++--- fs/ecryptfs/ecryptfs_kernel.h | 8 ++++++++ fs/ecryptfs/inode.c | 2 +- fs/ecryptfs/mmap.c | 19 +++++-------------- 4 files changed, 18 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 7cb0a59f4b9..c907f6f4935 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -381,8 +381,8 @@ out: static void ecryptfs_lower_offset_for_extent(loff_t *offset, loff_t extent_num, struct ecryptfs_crypt_stat *crypt_stat) { - (*offset) = (crypt_stat->num_header_bytes_at_front - + (crypt_stat->extent_size * extent_num)); + (*offset) = ecryptfs_lower_header_size(crypt_stat) + + (crypt_stat->extent_size * extent_num); } /** @@ -834,7 +834,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) set_extent_mask_and_shift(crypt_stat); crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - crypt_stat->num_header_bytes_at_front = 0; + crypt_stat->num_header_bytes_at_front = + ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else { if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) crypt_stat->num_header_bytes_at_front = diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 542f625312f..8456f70606a 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -464,6 +464,14 @@ struct ecryptfs_daemon { extern struct mutex ecryptfs_daemon_hash_mux; +static inline size_t +ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) + return 0; + return crypt_stat->num_header_bytes_at_front; +} + static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 4a430ab4115..1a739531b0d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -768,7 +768,7 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, { loff_t lower_size; - lower_size = crypt_stat->num_header_bytes_at_front; + lower_size = ecryptfs_lower_header_size(crypt_stat); if (upper_size != 0) { loff_t num_extents; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index df4ce99d059..5a30e01547f 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -97,19 +97,6 @@ out: * (big-endian) * Octet 26: Begin RFC 2440 authentication token packet set */ -static void set_header_info(char *page_virt, - struct ecryptfs_crypt_stat *crypt_stat) -{ - size_t written; - size_t save_num_header_bytes_at_front = - crypt_stat->num_header_bytes_at_front; - - crypt_stat->num_header_bytes_at_front = - ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; - ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); - crypt_stat->num_header_bytes_at_front = - save_num_header_bytes_at_front; -} /** * ecryptfs_copy_up_encrypted_with_header @@ -146,9 +133,13 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, memset(page_virt, 0, PAGE_CACHE_SIZE); /* TODO: Support more than one header extent */ if (view_extent_num == 0) { + size_t written; + rc = ecryptfs_read_xattr_region( page_virt, page->mapping->host); - set_header_info(page_virt, crypt_stat); + ecryptfs_write_header_metadata(page_virt + 20, + crypt_stat, + &written); } kunmap_atomic(page_virt, KM_USER0); flush_dcache_page(page); -- cgit v1.2.3-70-g09d2 From fa3ef1cb4e6e9958a9bfaa977c107c515907f102 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 11 Feb 2010 05:09:14 -0600 Subject: eCryptfs: Rename ecryptfs_crypt_stat.num_header_bytes_at_front This patch renames the num_header_bytes_at_front variable to metadata_size since it now contains the max size of the metadata. Reviewed-by: Eric Sandeen Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 24 ++++++++++-------------- fs/ecryptfs/ecryptfs_kernel.h | 4 ++-- fs/ecryptfs/inode.c | 2 +- fs/ecryptfs/mmap.c | 5 ++--- 4 files changed, 15 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index c907f6f4935..391f558eb4d 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -834,14 +834,13 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) set_extent_mask_and_shift(crypt_stat); crypt_stat->iv_bytes = ECRYPTFS_DEFAULT_IV_BYTES; if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - crypt_stat->num_header_bytes_at_front = - ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else { if (PAGE_CACHE_SIZE <= ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE) - crypt_stat->num_header_bytes_at_front = + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; else - crypt_stat->num_header_bytes_at_front = PAGE_CACHE_SIZE; + crypt_stat->metadata_size = PAGE_CACHE_SIZE; } } @@ -1238,8 +1237,7 @@ ecryptfs_write_header_metadata(char *virt, header_extent_size = (u32)crypt_stat->extent_size; num_header_extents_at_front = - (u16)(crypt_stat->num_header_bytes_at_front - / crypt_stat->extent_size); + (u16)(crypt_stat->metadata_size / crypt_stat->extent_size); put_unaligned_be32(header_extent_size, virt); virt += 4; put_unaligned_be16(num_header_extents_at_front, virt); @@ -1382,7 +1380,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } - virt_len = crypt_stat->num_header_bytes_at_front; + virt_len = crypt_stat->metadata_size; order = get_order(virt_len); /* Released in this function */ virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); @@ -1428,16 +1426,15 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, header_extent_size = get_unaligned_be32(virt); virt += sizeof(__be32); num_header_extents_at_front = get_unaligned_be16(virt); - crypt_stat->num_header_bytes_at_front = - (((size_t)num_header_extents_at_front - * (size_t)header_extent_size)); + crypt_stat->metadata_size = (((size_t)num_header_extents_at_front + * (size_t)header_extent_size)); (*bytes_read) = (sizeof(__be32) + sizeof(__be16)); if ((validate_header_size == ECRYPTFS_VALIDATE_HEADER_SIZE) - && (crypt_stat->num_header_bytes_at_front + && (crypt_stat->metadata_size < ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE)) { rc = -EINVAL; printk(KERN_WARNING "Invalid header size: [%zd]\n", - crypt_stat->num_header_bytes_at_front); + crypt_stat->metadata_size); } return rc; } @@ -1452,8 +1449,7 @@ static int parse_header_metadata(struct ecryptfs_crypt_stat *crypt_stat, */ static void set_default_header_data(struct ecryptfs_crypt_stat *crypt_stat) { - crypt_stat->num_header_bytes_at_front = - ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; + crypt_stat->metadata_size = ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE; } /** diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 8456f70606a..d031efd7666 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -273,7 +273,7 @@ struct ecryptfs_crypt_stat { u32 flags; unsigned int file_version; size_t iv_bytes; - size_t num_header_bytes_at_front; + size_t metadata_size; size_t extent_size; /* Data extent size; default is 4096 */ size_t key_size; size_t extent_shift; @@ -469,7 +469,7 @@ ecryptfs_lower_header_size(struct ecryptfs_crypt_stat *crypt_stat) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) return 0; - return crypt_stat->num_header_bytes_at_front; + return crypt_stat->metadata_size; } static inline struct ecryptfs_file_info * diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 1a739531b0d..a50efb18701 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -335,7 +335,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - file_size = (crypt_stat->num_header_bytes_at_front + file_size = (crypt_stat->metadata_size + i_size_read(lower_dentry->d_inode)); else file_size = i_size_read(lower_dentry->d_inode); diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 5a30e01547f..270f42ae7c0 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -122,8 +122,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, * num_extents_per_page) + extent_num_in_page); size_t num_header_extents_at_front = - (crypt_stat->num_header_bytes_at_front - / crypt_stat->extent_size); + (crypt_stat->metadata_size / crypt_stat->extent_size); if (view_extent_num < num_header_extents_at_front) { /* This is a header extent */ @@ -152,7 +151,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, /* This is an encrypted data extent */ loff_t lower_offset = ((view_extent_num * crypt_stat->extent_size) - - crypt_stat->num_header_bytes_at_front); + - crypt_stat->metadata_size); rc = ecryptfs_read_lower_page_segment( page, (lower_offset >> PAGE_CACHE_SHIFT), -- cgit v1.2.3-70-g09d2 From 1984c23f9e0cdb432d90a85ecf88b424d36878fc Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Wed, 10 Feb 2010 23:17:44 -0600 Subject: eCryptfs: Clear buffer before reading in metadata xattr We initially read in the first PAGE_CACHE_SIZE of a file to if the eCryptfs header marker can be found. If it isn't found and ecryptfs_xattr_metadata was given as a mount option, then the user.ecryptfs xattr is read into the same buffer. Since the data from the first page of the file wasn't cleared, it is possible that we think we've found a second tag 3 or tag 1 packet and then error out after the packet contents aren't as expected. This patch clears the buffer before filling it with metadata from the user.ecryptfs xattr. Reviewed-by: Eric Sandeen Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 1 + fs/ecryptfs/inode.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 391f558eb4d..4d9db0ed88e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1603,6 +1603,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry, ECRYPTFS_VALIDATE_HEADER_SIZE); if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); rc = ecryptfs_read_xattr_region(page_virt, ecryptfs_inode); if (rc) { printk(KERN_DEBUG "Valid eCryptfs headers not found in " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a50efb18701..ddbd096c740 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -323,6 +323,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { + memset(page_virt, 0, PAGE_CACHE_SIZE); rc = ecryptfs_read_and_validate_xattr_region(page_virt, ecryptfs_dentry); if (rc) { -- cgit v1.2.3-70-g09d2 From f4e60e6b303bc46cdc477d3174dbf9cb5dd013aa Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 11 Feb 2010 00:02:32 -0600 Subject: eCryptfs: Strip metadata in xattr flag in encrypted view The ecryptfs_encrypted_view mount option provides a unified way of viewing encrypted eCryptfs files. If the metadata is stored in a xattr, the metadata is moved to the file header when the file is read inside the eCryptfs mount. Because of this, we should strip the ECRYPTFS_METADATA_IN_XATTR flag from the header's flag section. This allows eCryptfs to treat the file as an eCryptfs file with a header at the front. Reviewed-by: Eric Sandeen Signed-off-by: Tyler Hicks --- fs/ecryptfs/crypto.c | 9 +++++---- fs/ecryptfs/ecryptfs_kernel.h | 3 +++ fs/ecryptfs/mmap.c | 14 ++++++++++++++ 3 files changed, 22 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 4d9db0ed88e..fad5bf6a611 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1107,9 +1107,9 @@ static void write_ecryptfs_marker(char *page_virt, size_t *written) (*written) = MAGIC_ECRYPTFS_MARKER_SIZE_BYTES; } -static void -write_ecryptfs_flags(char *page_virt, struct ecryptfs_crypt_stat *crypt_stat, - size_t *written) +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written) { u32 flags = 0; int i; @@ -1290,7 +1290,8 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, offset = ECRYPTFS_FILE_SIZE_BYTES; write_ecryptfs_marker((page_virt + offset), &written); offset += written; - write_ecryptfs_flags((page_virt + offset), crypt_stat, &written); + ecryptfs_write_crypt_stat_flags((page_virt + offset), crypt_stat, + &written); offset += written; ecryptfs_write_header_metadata((page_virt + offset), crypt_stat, &written); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index d031efd7666..bc7115403f3 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -659,6 +659,9 @@ int ecryptfs_decrypt_page(struct page *page); int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry); int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry); int ecryptfs_new_file_context(struct dentry *ecryptfs_dentry); +void ecryptfs_write_crypt_stat_flags(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat, + size_t *written); int ecryptfs_read_and_validate_header_region(char *data, struct inode *ecryptfs_inode); int ecryptfs_read_and_validate_xattr_region(char *page_virt, diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 270f42ae7c0..bea998a25af 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -82,6 +82,19 @@ out: return rc; } +static void strip_xattr_flag(char *page_virt, + struct ecryptfs_crypt_stat *crypt_stat) +{ + if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) { + size_t written; + + crypt_stat->flags &= ~ECRYPTFS_METADATA_IN_XATTR; + ecryptfs_write_crypt_stat_flags(page_virt, crypt_stat, + &written); + crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; + } +} + /** * Header Extent: * Octets 0-7: Unencrypted file size (big-endian) @@ -136,6 +149,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, rc = ecryptfs_read_xattr_region( page_virt, page->mapping->host); + strip_xattr_flag(page_virt + 16, crypt_stat); ecryptfs_write_header_metadata(page_virt + 20, crypt_stat, &written); -- cgit v1.2.3-70-g09d2 From 810627a013163cd294762d57c0ea2ec055ffe4f6 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 27 Mar 2010 02:00:49 +0000 Subject: [CIFS] Add mmap for direct, nobrl cifs mount types without mmap functions in file_ops OpenOffice can't save changes in existing document. The same situation you can see with gedit. Also, a.out format of files can't be executed without mmap. Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5183bc2a191..ded66be6597 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -808,6 +808,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .release = cifs_close, .fsync = cifs_fsync, .flush = cifs_flush, + .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, -- cgit v1.2.3-70-g09d2 From e05c378f4973674a16d5b9636f2310cf88aca5f2 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 30 Mar 2010 18:25:17 +0200 Subject: [LogFS] Remove unused method All callers are long gone. Signed-off-by: Joern Engel --- fs/logfs/logfs.h | 1 - fs/logfs/readwrite.c | 21 --------------------- fs/logfs/segment.c | 6 ------ 3 files changed, 28 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index b84b0eec602..97195b9e93a 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -305,7 +305,6 @@ typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, level_t level, int child_no, __be64 val); struct logfs_block_ops { void (*write_block)(struct logfs_block *block); - gc_level_t (*block_level)(struct logfs_block *block); void (*free_block)(struct super_block *sb, struct logfs_block*block); int (*write_alias)(struct super_block *sb, struct logfs_block *block, diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index c3a3a6814b8..3659c37fbd7 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -429,25 +429,6 @@ static void inode_write_block(struct logfs_block *block) } } -static gc_level_t inode_block_level(struct logfs_block *block) -{ - BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER); - return GC_LEVEL(LOGFS_MAX_LEVELS); -} - -static gc_level_t indirect_block_level(struct logfs_block *block) -{ - struct page *page; - struct inode *inode; - u64 bix; - level_t level; - - page = block->page; - inode = page->mapping->host; - logfs_unpack_index(page->index, &bix, &level); - return expand_level(inode->i_ino, level); -} - /* * This silences a false, yet annoying gcc warning. I hate it when my editor * jumps into bitops.h each time I recompile this file. @@ -586,14 +567,12 @@ static void indirect_free_block(struct super_block *sb, static struct logfs_block_ops inode_block_ops = { .write_block = inode_write_block, - .block_level = inode_block_level, .free_block = inode_free_block, .write_alias = inode_write_alias, }; struct logfs_block_ops indirect_block_ops = { .write_block = indirect_write_block, - .block_level = indirect_block_level, .free_block = indirect_free_block, .write_alias = indirect_write_alias, }; diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 0ecd8f07c11..02db22ebbf1 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -182,14 +182,8 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block, return 0; } -static gc_level_t btree_block_level(struct logfs_block *block) -{ - return expand_level(block->ino, block->level); -} - static struct logfs_block_ops btree_block_ops = { .write_block = btree_write_block, - .block_level = btree_block_level, .free_block = __free_block, .write_alias = btree_write_alias, }; -- cgit v1.2.3-70-g09d2 From efd647f744f3cf504ed83580274bd4b6918139fe Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 30 Mar 2010 13:21:31 +0800 Subject: ocfs2_dlmfs: User DLM_* when decoding file open flags. In commit 0016eedc4185a3cd7e578b027a6e69001b85d6c4, we have changed dlmfs to use stackglue. So when use DLM* when we decode dlm flags from open level. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/dlmfs/dlmfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1b0de157a08..a99d1eafa8e 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -112,20 +112,20 @@ MODULE_PARM_DESC(capabilities, DLMFS_CAPABILITIES); * O_RDONLY -> PRMODE level * O_WRONLY -> EXMODE level * - * O_NONBLOCK -> LKM_NOQUEUE + * O_NONBLOCK -> NOQUEUE */ static int dlmfs_decode_open_flags(int open_flags, int *level, int *flags) { if (open_flags & (O_WRONLY|O_RDWR)) - *level = LKM_EXMODE; + *level = DLM_LOCK_EX; else - *level = LKM_PRMODE; + *level = DLM_LOCK_PR; *flags = 0; if (open_flags & O_NONBLOCK) - *flags |= LKM_NOQUEUE; + *flags |= DLM_LKF_NOQUEUE; return 0; } @@ -166,7 +166,7 @@ static int dlmfs_file_open(struct inode *inode, * to be able userspace to be able to distinguish a * valid lock request from one that simply couldn't be * granted. */ - if (flags & LKM_NOQUEUE && status == -EAGAIN) + if (flags & DLM_LKF_NOQUEUE && status == -EAGAIN) status = -ETXTBSY; kfree(fp); goto bail; @@ -193,7 +193,7 @@ static int dlmfs_file_release(struct inode *inode, status = 0; if (fp) { level = fp->fp_lock_level; - if (level != LKM_IVMODE) + if (level != DLM_LOCK_IV) user_dlm_cluster_unlock(&ip->ip_lockres, level); kfree(fp); -- cgit v1.2.3-70-g09d2 From a03ab788d070c256eff1ac24cf0e3bc2ca148096 Mon Sep 17 00:00:00 2001 From: Coly Li Date: Fri, 26 Mar 2010 05:15:12 +0800 Subject: ocfs2: one more warning fix in ocfs2_file_aio_write(), v2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes another compiling warning in ocfs2_file_aio_write() like this, fs/ocfs2/file.c: In function ‘ocfs2_file_aio_write’: fs/ocfs2/file.c:2026: warning: suggest parentheses around ‘&&’ within ‘||’ As Joel suggested, '!ret' is unary, this version removes the wrap from '!ret'. Signed-off-by: Coly Li Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 17947dc8341..2b4235c5831 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -2021,9 +2021,9 @@ out_dio: if (ret < 0) written = ret; - if (!ret && (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters || - has_refcount)) { + if (!ret && ((old_size != i_size_read(inode)) || + (old_clusters != OCFS2_I(inode)->ip_clusters) || + has_refcount)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; -- cgit v1.2.3-70-g09d2 From 428257f8870f0e72e85ce782d091fa1f366de7df Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Wed, 24 Mar 2010 22:40:44 +0800 Subject: ocfs2: Check the owner of a lockres inside the spinlock The checking of lockres owner in dlm_update_lvb() is not inside spinlock protection. I don't see problem in current call path of dlm_update_lvb(). But just for code robustness. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmast.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index dccc439fa08..b7a25ef18e2 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -185,9 +185,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, BUG_ON(!lksb); /* only updates if this node masters the lockres */ + spin_lock(&res->spinlock); if (res->owner == dlm->node_num) { - - spin_lock(&res->spinlock); /* check the lksb flags for the direction */ if (lksb->flags & DLM_LKSB_GET_LVB) { mlog(0, "getting lvb from lockres for %s node\n", @@ -202,8 +201,8 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * here. In the future we might want to clear it at the time * the put is actually done. */ - spin_unlock(&res->spinlock); } + spin_unlock(&res->spinlock); /* reset any lvb flags on the lksb */ lksb->flags &= ~(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB); -- cgit v1.2.3-70-g09d2 From 9358c6d4c0264b1572554c49c4b92673ea9a5c72 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 30 Mar 2010 13:54:41 -0700 Subject: ceph: fix dentry rehashing on virtual .snap dir If a lookup fails on the magic .snap directory, we bind it to a magic snap directory inode in ceph_lookup_finish(). That code assumes the dentry is unhashed, but a recent server-side change started returning NULL leases on lookup failure, causing the .snap dentry to be hashed and NULL by ceph_fill_trace(). This causes dentry hash chain corruption, or a dies when d_rehash() includes BUG_ON(!d_unhashed(entry)); So, avoid processing the NULL dentry lease if it the dentry matches the snapdir name in ceph_fill_trace(). That allows the lookup completion to properly bind it to the snapdir inode. BUG there if dentry is hashed to be sure. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 1 + fs/ceph/inode.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 8a9116e15b7..aed8fda3302 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -488,6 +488,7 @@ struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct inode *inode = ceph_get_snapdir(parent); dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n", dentry, dentry->d_name.len, dentry->d_name.name, inode); + BUG_ON(!d_unhashed(dentry)); d_add(dentry, inode); err = 0; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index aca82d55cc5..26f883c275e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -886,6 +886,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, struct inode *in = NULL; struct ceph_mds_reply_inode *ininfo; struct ceph_vino vino; + struct ceph_client *client = ceph_sb_to_client(sb); int i = 0; int err = 0; @@ -949,7 +950,14 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, return err; } - if (rinfo->head->is_dentry && !req->r_aborted) { + /* + * ignore null lease/binding on snapdir ENOENT, or else we + * will have trouble splicing in the virtual snapdir later + */ + if (rinfo->head->is_dentry && !req->r_aborted && + (rinfo->head->is_target || strncmp(req->r_dentry->d_name.name, + client->mount_args->snapdir_name, + req->r_dentry->d_name.len))) { /* * lookup link rename : null -> possibly existing inode * mknod symlink mkdir : null -> new inode -- cgit v1.2.3-70-g09d2 From 753234007f4ac2c96921cfb19ec1ba535ac29790 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Wed, 31 Mar 2010 15:41:00 +0800 Subject: nilfs2: fix a wrong type conversion in nilfs_ioctl() (void * __user *) should be (void __user *) Signed-off-by: Li Hong Signed-off-by: Ryusuke Konishi --- fs/nilfs2/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 313d0a21da4..c446017141d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -648,7 +648,7 @@ static int nilfs_ioctl_get_info(struct inode *inode, struct file *filp, long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - void __user *argp = (void * __user *)arg; + void __user *argp = (void __user *)arg; switch (cmd) { case NILFS_IOCTL_CHANGE_CPMODE: -- cgit v1.2.3-70-g09d2 From a42ab8e1a37257da37e0f018e707bf365ac24531 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Wed, 31 Mar 2010 18:25:44 -0700 Subject: ocfs2: Compute metaecc for superblocks during online resize. Online resize writes out the new superblock and its backups directly. The metaecc data wasn't being recomputed. Let's do that directly. Signed-off-by: Joel Becker Acked-by: Mark Fasheh [ Cc: stable@kernel.org --- fs/ocfs2/buffer_head_io.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 21c808f752d..b18c6d677f9 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -407,6 +407,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh) { int ret = 0; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; mlog_entry_void(); @@ -426,6 +427,7 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &di->i_check); submit_bh(WRITE, bh); wait_on_buffer(bh); -- cgit v1.2.3-70-g09d2 From 80e755fedebc8de0599a79efad2c656503df2e62 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 31 Mar 2010 21:52:10 -0700 Subject: ceph: allow writeback of snapped pages older than 'oldest' snapc On snap deletion, we don't regenerate ceph_cap_snaps for inodes with dirty pages because deletion does not affect metadata writeback. However, we did run into problems when we went to write back the pages because the 'oldest' snapc is determined by the oldest cap_snap, and that may be the newer snapc that reflects the deletion. This caused confusion and an infinite loop in ceph_update_writeable_page(). Change the snapc checks to allow writeback of any snapc that is equal to OR older than the 'oldest' snapc. When there are no cap_snaps, we were also using the realm's latest snapc for writeback, which complicates ceph_put_wrbufffer_cap_refs(). Instead, use i_head_snapc, the most snapc used for the most recent ('head') data. This makes the writeback snapc (ceph_osd_request.r_snapc) _always_ match a capsnap or i_head_snapc. Also, in writepags_finish(), drop the snapc referenced by the _page_ and do not assume it matches the request snapc (it may not anymore). Signed-off-by: Sage Weil --- fs/ceph/addr.c | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ce8ef610772..a313e9baeed 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -356,8 +356,8 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode, break; } } - if (!snapc && ci->i_snap_realm) { - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); + if (!snapc && ci->i_head_snapc) { + snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } @@ -412,7 +412,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - if (snapc != get_oldest_context(inode, &snap_size)) { + if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, (void *)page->private); /* we should only noop if called by kswapd */ @@ -557,9 +557,9 @@ static void writepages_finish(struct ceph_osd_request *req, dout("inode %p skipping page %p\n", inode, page); wbc->pages_skipped++; } + ceph_put_snap_context((void *)page->private); page->private = 0; ClearPagePrivate(page); - ceph_put_snap_context(snapc); dout("unlocking %d %p\n", i, page); end_page_writeback(page); @@ -617,7 +617,7 @@ static int ceph_writepages_start(struct address_space *mapping, int range_whole = 0; int should_loop = 1; pgoff_t max_pages = 0, max_pages_ever = 0; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL; + struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; struct pagevec pvec; int done = 0; int rc = 0; @@ -769,9 +769,10 @@ get_more_pages: } /* only if matching snap context */ - if (snapc != (void *)page->private) { - dout("page snapc %p != oldest %p\n", - (void *)page->private, snapc); + pgsnapc = (void *)page->private; + if (pgsnapc->seq > snapc->seq) { + dout("page snapc %p %lld > oldest %p %lld\n", + pgsnapc, pgsnapc->seq, snapc, snapc->seq); unlock_page(page); if (!locked_pages) continue; /* keep looking for snap */ @@ -935,8 +936,8 @@ static int ceph_update_writeable_page(struct file *file, int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; loff_t i_size; - struct ceph_snap_context *snapc; int r; + struct ceph_snap_context *snapc, *oldest; retry_locked: /* writepages currently holds page lock, but if we change that later, */ @@ -946,16 +947,16 @@ retry_locked: BUG_ON(!ci->i_snap_realm); down_read(&mdsc->snap_rwsem); BUG_ON(!ci->i_snap_realm->cached_context); - if (page->private && - (void *)page->private != ci->i_snap_realm->cached_context) { + snapc = (void *)page->private; + if (snapc && snapc != ci->i_head_snapc) { /* * this page is already dirty in another (older) snap * context! is it writeable now? */ - snapc = get_oldest_context(inode, NULL); + oldest = get_oldest_context(inode, NULL); up_read(&mdsc->snap_rwsem); - if (snapc != (void *)page->private) { + if (snapc->seq > oldest->seq) { dout(" page %p snapc %p not current or oldest\n", page, (void *)page->private); /* -- cgit v1.2.3-70-g09d2 From 6298a33757ba7361bb8f506c106daad77e5ac8cf Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 31 Mar 2010 22:01:38 -0700 Subject: ceph: fix snap context reference leaks The get_oldest_context() helper takes a reference to the returned snap context, but most callers weren't dropping that reference. Fix them. Also drop the unused locked __get_oldest_context() variant. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index a313e9baeed..41f1f713b7b 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -336,16 +336,15 @@ out: /* * Get ref for the oldest snapc for an inode with dirty data... that is, the * only snap context we are allowed to write back. - * - * Caller holds i_lock. */ -static struct ceph_snap_context *__get_oldest_context(struct inode *inode, - u64 *snap_size) +static struct ceph_snap_context *get_oldest_context(struct inode *inode, + u64 *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; struct ceph_cap_snap *capsnap = NULL; + spin_lock(&inode->i_lock); list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap, capsnap->context, capsnap->dirty_pages); @@ -361,16 +360,6 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode, dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); } - return snapc; -} - -static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) -{ - struct ceph_snap_context *snapc = NULL; - - spin_lock(&inode->i_lock); - snapc = __get_oldest_context(inode, snap_size); spin_unlock(&inode->i_lock); return snapc; } @@ -391,7 +380,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) int len = PAGE_CACHE_SIZE; loff_t i_size; int err = 0; - struct ceph_snap_context *snapc; + struct ceph_snap_context *snapc, *oldest; u64 snap_size = 0; long writeback_stat; @@ -412,13 +401,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - if (snapc->seq > get_oldest_context(inode, &snap_size)->seq) { + oldest = get_oldest_context(inode, &snap_size); + if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, (void *)page->private); /* we should only noop if called by kswapd */ WARN_ON((current->flags & PF_MEMALLOC) == 0); + ceph_put_snap_context(oldest); goto out; } + ceph_put_snap_context(oldest); /* is this a partial page at end of file? */ if (snap_size) @@ -457,7 +449,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ClearPagePrivate(page); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); - ceph_put_snap_context(snapc); + ceph_put_snap_context(snapc); /* page's reference */ out: return err; } @@ -914,7 +906,10 @@ static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); - return !oldest || snapc->seq <= oldest->seq; + int ret = !oldest || snapc->seq <= oldest->seq; + + ceph_put_snap_context(oldest); + return ret; } /* @@ -957,13 +952,14 @@ retry_locked: up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { + ceph_put_snap_context(oldest); dout(" page %p snapc %p not current or oldest\n", - page, (void *)page->private); + page, snapc); /* * queue for writeback, and wait for snapc to * be writeable or written */ - snapc = ceph_get_snap_context((void *)page->private); + snapc = ceph_get_snap_context(snapc); unlock_page(page); ceph_queue_writeback(inode); r = wait_event_interruptible(ci->i_cap_wq, @@ -973,6 +969,7 @@ retry_locked: return r; return -EAGAIN; } + ceph_put_snap_context(oldest); /* yay, writeable, do it now (without dropping page lock) */ dout(" page %p snapc %p not current, but oldest\n", -- cgit v1.2.3-70-g09d2 From 819ccbfa448403992ceafc05d6d7097aaa74d4c3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 1 Apr 2010 09:33:46 -0700 Subject: ceph: fix leaked inode ref due to snap metadata writeback race We create a ceph_cap_snap if there is dirty cap metadata (for writeback to mds) OR dirty pages (for writeback to osd). It is thus possible that the metadata has been written back to the MDS but the OSD data has not when the cap_snap is created. This results in a cap_snap with dirty(caps) == 0. The problem is that cap writeback to the MDS isn't necessary, and a FLUSHSNAP cap op gets no ack from the MDS. This leaves the cap_snap attached to the inode along with its inode reference. Fix the problem by dropping the cap_snap if it becomes 'complete' (all pages written out) and dirty(caps) == 0 in ceph_put_wrbuffer_cap_refs(). Also, BUG() in __ceph_flush_snaps() if we encounter a cap_snap with dirty(caps) == 0. Signed-off-by: Sage Weil --- fs/ceph/caps.c | 42 ++++++++++++++++++++++++++++++++---------- fs/ceph/snap.c | 10 ++++++---- 2 files changed, 38 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 7d0a0d0adc1..b6fdf010749 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1204,6 +1204,12 @@ retry: if (capsnap->dirty_pages || capsnap->writing) continue; + /* + * if cap writeback already occurred, we should have dropped + * the capsnap in ceph_put_wrbuffer_cap_refs. + */ + BUG_ON(capsnap->dirty == 0); + /* pick mds, take s_mutex */ mds = __ceph_get_cap_mds(ci, &mseq); if (session && session->s_mds != mds) { @@ -2117,8 +2123,8 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) } spin_unlock(&inode->i_lock); - dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had), - last ? "last" : ""); + dout("put_cap_refs %p had %s%s%s\n", inode, ceph_cap_string(had), + last ? " last" : "", put ? " put" : ""); if (last && !flushsnaps) ceph_check_caps(ci, 0, NULL); @@ -2142,7 +2148,8 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, { struct inode *inode = &ci->vfs_inode; int last = 0; - int last_snap = 0; + int complete_capsnap = 0; + int drop_capsnap = 0; int found = 0; struct ceph_cap_snap *capsnap = NULL; @@ -2165,19 +2172,32 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) { if (capsnap->context == snapc) { found = 1; - capsnap->dirty_pages -= nr; - last_snap = !capsnap->dirty_pages; break; } } BUG_ON(!found); + capsnap->dirty_pages -= nr; + if (capsnap->dirty_pages == 0) { + complete_capsnap = 1; + if (capsnap->dirty == 0) + /* cap writeback completed before we created + * the cap_snap; no FLUSHSNAP is needed */ + drop_capsnap = 1; + } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s\n", + " snap %lld %d/%d -> %d/%d %s%s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - last_snap ? " (capsnap last)" : ""); + complete_capsnap ? " (complete capsnap)" : "", + drop_capsnap ? " (drop capsnap)" : ""); + if (drop_capsnap) { + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + } } spin_unlock(&inode->i_lock); @@ -2185,10 +2205,12 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (last) { ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); iput(inode); - } else if (last_snap) { + } else if (complete_capsnap) { ceph_flush_snaps(ci); wake_up(&ci->i_cap_wq); } + if (drop_capsnap) + iput(inode); } /* @@ -2464,8 +2486,8 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, break; } WARN_ON(capsnap->dirty_pages || capsnap->writing); - dout(" removing cap_snap %p follows %lld\n", - capsnap, follows); + dout(" removing %p cap_snap %p follows %lld\n", + inode, capsnap, follows); ceph_put_snap_context(capsnap->context); list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index df04e210a05..7e3e5f9edaa 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -521,15 +521,17 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; if (capsnap->dirty_pages) { - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu " + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " "still has %d dirty pages\n", inode, capsnap, capsnap->context, capsnap->context->seq, - capsnap->size, capsnap->dirty_pages); + ceph_cap_string(capsnap->dirty), capsnap->size, + capsnap->dirty_pages); return 0; } - dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n", + dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu\n", inode, capsnap, capsnap->context, - capsnap->context->seq, capsnap->size); + capsnap->context->seq, ceph_cap_string(capsnap->dirty), + capsnap->size); spin_lock(&mdsc->snap_flush_lock); list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list); -- cgit v1.2.3-70-g09d2 From 308f44193f796b1c522b3b87760e43d8d8e316d2 Mon Sep 17 00:00:00 2001 From: Li Hong Date: Fri, 2 Apr 2010 18:40:39 +0800 Subject: nilfs2: Remove an uninitialization warning in nilfs_btree_propagate_v() `make CONFIG_NILFS2_FS=m M=fs/nilfs2/` will give the following warnings: fs/nilfs2/btree.c: In function 'nilfs_btree_propagate': fs/nilfs2/btree.c:1882: warning: 'maxlevel' may be used uninitialized in this function fs/nilfs2/btree.c:1882: note: 'maxlevel' was declared here Set maxlevel = 0 to fix it. Signed-off-by: Li Hong Signed-off-by: Ryusuke Konishi --- fs/nilfs2/btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 7cdd98b8d51..76c38e3e19d 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -1879,7 +1879,7 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) { - int maxlevel, ret; + int maxlevel = 0, ret; struct nilfs_btree_node *parent; struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); __u64 ptr; -- cgit v1.2.3-70-g09d2 From 0e0d5e0c4bb0476d53a43bfc87d03a25ec4b5579 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 2 Apr 2010 16:07:19 -0700 Subject: ceph: fix ack counter reset on connection reset If in_seq_acked isn't reset along with in_seq, we don't ack received messages until we reach the old count, consuming gobs memory on the other end of the connection and introducing a large delay when those messages are eventually deleted. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index a32f0f896d9..f35b4945a9c 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -332,6 +332,7 @@ static void reset_connection(struct ceph_connection *con) con->out_msg = NULL; } con->in_seq = 0; + con->in_seq_acked = 0; } /* -- cgit v1.2.3-70-g09d2 From a24e2d7d8f512340991ef0a59cb5d08d491b8e98 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sat, 3 Apr 2010 17:20:21 +0000 Subject: [CIFS] initialize nbytes at the beginning of CIFSSMBWrite() By doing this we always overwrite nbytes value that is being passed on to CIFSSMBWrite() and need not rely on the callers to initialize. CIFSSMBWrite2 is doing this already. CC: Stable Reviewed-by: Shirish Pargaonkar Reviewed-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 7cc7f83e931..e1f90a3a016 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1430,6 +1430,8 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, __u32 bytes_sent; __u16 byte_count; + *nbytes = 0; + /* cFYI(1, ("write at %lld %d bytes", offset, count));*/ if (tcon->ses == NULL) return -ECONNABORTED; @@ -1512,7 +1514,6 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, cifs_stats_inc(&tcon->num_writes); if (rc) { cFYI(1, ("Send error in write = %d", rc)); - *nbytes = 0; } else { *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; -- cgit v1.2.3-70-g09d2 From 6513a81e9325d712f1bfb9a1d7b750134e49ff18 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Wed, 31 Mar 2010 12:00:03 +0530 Subject: cifs: Fix a kernel BUG with remote OS/2 server (try #3) While chasing a bug report involving a OS/2 server, I noticed the server sets pSMBr->CountHigh to a incorrect value even in case of normal writes. This results in 'nbytes' being computed wrongly and triggers a kernel BUG at mm/filemap.c. void iov_iter_advance(struct iov_iter *i, size_t bytes) { BUG_ON(i->count < bytes); <--- BUG here Why the server is setting 'CountHigh' is not clear but only does so after writing 64k bytes. Though this looks like the server bug, the client side crash may not be acceptable. The workaround is to mask off high 16 bits if the number of bytes written as returned by the server is greater than the bytes requested by the client as suggested by Jeff Layton. CC: Stable Reviewed-by: Jeff Layton Signed-off-by: Suresh Jayaraman Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index e1f90a3a016..f213b8ae43c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1518,6 +1518,14 @@ CIFSSMBWrite(const int xid, struct cifsTconInfo *tcon, *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. Some + * OS/2 servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; } cifs_buf_release(pSMB); @@ -1606,6 +1614,14 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, *nbytes = le16_to_cpu(pSMBr->CountHigh); *nbytes = (*nbytes) << 16; *nbytes += le16_to_cpu(pSMBr->Count); + + /* + * Mask off high 16 bits when bytes written as returned by the + * server is greater than bytes requested by the client. OS/2 + * servers are known to set incorrect CountHigh values. + */ + if (*nbytes > count) + *nbytes &= 0xFFFF; } /* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ -- cgit v1.2.3-70-g09d2 From 8b472d739b2ddd8ab7fb278874f696cd95b25a5e Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sat, 3 Apr 2010 16:45:06 -0400 Subject: ext4: Fix possible lost inode write in no journal mode In the no-journal case, ext4_write_inode() will fetch the bh and call sync_dirty_buffer() on it. However, if the bh has already been written and the bh reclaimed for some other purpose, AND if the inode is the only one in the inode table block in use, then ext4_get_inode_loc() will not read the inode table block from disk, but as an optimization, fill the block with zero's assuming that its caller will copy in the on-disk version of the inode. This is not done by ext4_write_inode(), so the contents of the inode can simply get lost. The fix is to use __ext4_get_inode_loc() with in_mem set to 0, instead of ext4_get_inode_loc(). Long term the API needs to be fixed so it's obvious why latter is not safe. Addresses-Google-Bug: #2526446 Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 11119e07233..87e3c70d069 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5374,7 +5374,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) } else { struct ext4_iloc iloc; - err = ext4_get_inode_loc(inode, &iloc); + err = __ext4_get_inode_loc(inode, &iloc, 0); if (err) return err; if (wbc->sync_mode == WB_SYNC_ALL) -- cgit v1.2.3-70-g09d2 From fd2dd9fbaf9e498ec63eef298921e36556f7214c Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Sat, 3 Apr 2010 17:44:16 -0400 Subject: ext4: Fix buffer head leaks after calls to ext4_get_inode_loc() Calls to ext4_get_inode_loc() returns with a reference to a buffer head in iloc->bh. The callers of this function in ext4_write_inode() when in no journal mode and in ext4_xattr_fiemap() don't release the buffer head after using it. Addresses-Google-Bug: #2548165 Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 94c8ee81f5e..236b834b4ca 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3879,6 +3879,7 @@ static int ext4_xattr_fiemap(struct inode *inode, physical += offset; length = EXT4_SB(inode->i_sb)->s_inode_size - offset; flags |= FIEMAP_EXTENT_DATA_INLINE; + brelse(iloc.bh); } else { /* external block */ physical = EXT4_I(inode)->i_file_acl << blockbits; length = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 87e3c70d069..ba1eee847e3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5385,6 +5385,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) (unsigned long long)iloc.bh->b_blocknr); err = -EIO; } + brelse(iloc.bh); } return err; } -- cgit v1.2.3-70-g09d2 From ab6e24103cbd215e922938a4f58c75194761a60e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 19 Mar 2010 14:38:13 +0000 Subject: Btrfs: fix data enospc check overflow Because we account for reserved space we get from the allocator before we actually account for allocating delalloc space, we can have a small window where the amount of "used" space in a space_info is more than the total amount of space in the space_info. This will cause a overflow in our check, so it will seem like we have _tons_ of free space, and we'll allow reservations to occur that will end up larger than the amount of space we have. I've seen users report ENOSPC panic's in cow_file_range a few times recently, so I tried to reproduce this problem and found I could reproduce it if I ran one of my tests in a loop for like 20 minutes. With this patch my test ran all night without issues. Thanks, Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 101041d4d2b..786899da3eb 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3234,7 +3234,8 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; - int ret = 0, committed = 0; + u64 used; + int ret = 0, committed = 0, flushed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -3246,12 +3247,21 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, again: /* make sure we have enough space to handle the data first */ spin_lock(&data_sinfo->lock); - if (data_sinfo->total_bytes - data_sinfo->bytes_used - - data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - - data_sinfo->bytes_may_use - data_sinfo->bytes_super < bytes) { + used = data_sinfo->bytes_used + data_sinfo->bytes_delalloc + + data_sinfo->bytes_reserved + data_sinfo->bytes_pinned + + data_sinfo->bytes_readonly + data_sinfo->bytes_may_use + + data_sinfo->bytes_super; + + if (used + bytes > data_sinfo->total_bytes) { struct btrfs_trans_handle *trans; + if (!flushed) { + spin_unlock(&data_sinfo->lock); + flush_delalloc(root, data_sinfo); + flushed = 1; + goto again; + } + /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. -- cgit v1.2.3-70-g09d2 From 9f680ce04ea19dabbbafe01b57b61930a9b70741 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 6 Apr 2010 09:37:47 -0400 Subject: Btrfs: make sure the chunk allocator doesn't create zero length chunks A recent commit allowed for smaller chunks to be created, but didn't make sure they were always bigger than a stripe. After some divides, this led to zero length stripes. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9bf1f581b87..b584e9a2add 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2249,6 +2249,12 @@ again: if (!looped) calc_size = max_t(u64, min_stripe_size, calc_size); + /* + * we're about to do_div by the stripe_len so lets make sure + * we end up with something bigger than a stripe + */ + calc_size = max_t(u64, calc_size, stripe_len * 4); + do_div(calc_size, stripe_len); calc_size *= stripe_len; -- cgit v1.2.3-70-g09d2 From f05337c6ac48d19d354e0640a8eb8fc884f82bcc Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Mon, 5 Apr 2010 09:59:14 +0400 Subject: not overwriting file_lock structure after GET_LK If we have preventing lock, cifs should overwrite file_lock structure with info about preventing lock. If we haven't preventing lock, cifs should leave it unchanged except for the lock type (change it to F_UNLCK). Signed-off-by: Pavel Shilovsky Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifssmb.c | 15 ++++++++++++++- fs/cifs/file.c | 28 ++++++++++++++++++++++++++-- 2 files changed, 40 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f213b8ae43c..184a399749a 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1810,8 +1810,21 @@ CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, } parm_data = (struct cifs_posix_lock *) ((char *)&pSMBr->hdr.Protocol + data_offset); - if (parm_data->lock_type == cpu_to_le16(CIFS_UNLCK)) + if (parm_data->lock_type == __constant_cpu_to_le16(CIFS_UNLCK)) pLockData->fl_type = F_UNLCK; + else { + if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_RDLCK)) + pLockData->fl_type = F_RDLCK; + else if (parm_data->lock_type == + __constant_cpu_to_le16(CIFS_WRLCK)) + pLockData->fl_type = F_WRLCK; + + pLockData->fl_start = parm_data->start; + pLockData->fl_end = parm_data->start + + parm_data->length - 1; + pLockData->fl_pid = parm_data->pid; + } } plk_err_exit: diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ca2ba7a0193..d9e86504b9d 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -838,8 +838,32 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) } else { /* if rc == ERR_SHARING_VIOLATION ? */ - rc = 0; /* do not change lock type to unlock - since range in use */ + rc = 0; + + if (lockType & LOCKING_ANDX_SHARED_LOCK) { + pfLock->fl_type = F_WRLCK; + } else { + rc = CIFSSMBLock(xid, tcon, netfid, length, + pfLock->fl_start, 0, 1, + lockType | LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */); + if (rc == 0) { + rc = CIFSSMBLock(xid, tcon, netfid, + length, pfLock->fl_start, 1, 0, + lockType | + LOCKING_ANDX_SHARED_LOCK, + 0 /* wait flag */); + pfLock->fl_type = F_RDLCK; + if (rc != 0) + cERROR(1, ("Error unlocking " + "previously locked range %d " + "during test of lock", rc)); + rc = 0; + } else { + pfLock->fl_type = F_WRLCK; + rc = 0; + } + } } FreeXid(xid); -- cgit v1.2.3-70-g09d2 From 55ab3a1ff843e3f0e24d2da44e71bffa5d853010 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Tue, 6 Apr 2010 14:34:58 -0700 Subject: raw: fsync method is now required Commit 148f948ba877f4d3cdef036b1ff6d9f68986706a (vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode) broke the raw driver. We now call through generic_file_aio_write -> generic_write_sync -> vfs_fsync_range. vfs_fsync_range has: if (!fop || !fop->fsync) { ret = -EINVAL; goto out; } But drivers/char/raw.c doesn't set an fsync method. We have two options: fix it or remove the raw driver completely. I'm happy to do either, the fact this has been broken for so long suggests it is rarely used. The patch below adds an fsync method to the raw driver. My knowledge of the block layer is pretty sketchy so this could do with a once over. If we instead decide to remove the raw driver, this patch might still be useful as a backport to 2.6.33 and 2.6.32. Signed-off-by: Anton Blanchard Reviewed-by: Jan Kara Cc: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Reviewed-by: Jeff Moyer Tested-by: Jeff Moyer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/raw.c | 1 + fs/block_dev.c | 3 ++- include/linux/fs.h | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index d331c59b571..f5a32317b75 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -248,6 +248,7 @@ static const struct file_operations raw_fops = { .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = blkdev_aio_write, + .fsync = block_fsync, .open = raw_open, .release= raw_release, .ioctl = raw_ioctl, diff --git a/fs/block_dev.c b/fs/block_dev.c index d11d0289f3d..8db62b2b6df 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -404,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) * NULL first argument is nfsd_sync_dir() and that's not a directory. */ -static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) +int block_fsync(struct file *filp, struct dentry *dentry, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; @@ -418,6 +418,7 @@ static int block_fsync(struct file *filp, struct dentry *dentry, int datasync) error = 0; return error; } +EXPORT_SYMBOL(block_fsync); /* * pseudo-fs diff --git a/include/linux/fs.h b/include/linux/fs.h index 10b8dedcd18..5d9c7e27c5a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2212,6 +2212,7 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); +extern int block_fsync(struct file *filp, struct dentry *dentry, int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, -- cgit v1.2.3-70-g09d2 From b1dd3b2843b3b73b7fc2ee47d96310cd1c051371 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 6 Apr 2010 14:35:00 -0700 Subject: vfs: rename block_fsync() to blkdev_fsync() Requested by hch, for consistency now it is exported. Cc: Alexander Viro Cc: Anton Blanchard Cc: Christoph Hellwig Cc: Jan Kara Cc: Jeff Moyer Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/char/raw.c | 2 +- fs/block_dev.c | 6 +++--- include/linux/fs.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/drivers/char/raw.c b/drivers/char/raw.c index f5a32317b75..8756ab0daa8 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -248,7 +248,7 @@ static const struct file_operations raw_fops = { .aio_read = generic_file_aio_read, .write = do_sync_write, .aio_write = blkdev_aio_write, - .fsync = block_fsync, + .fsync = blkdev_fsync, .open = raw_open, .release= raw_release, .ioctl = raw_ioctl, diff --git a/fs/block_dev.c b/fs/block_dev.c index 8db62b2b6df..2a6d0193f13 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -404,7 +404,7 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) * NULL first argument is nfsd_sync_dir() and that's not a directory. */ -int block_fsync(struct file *filp, struct dentry *dentry, int datasync) +int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; @@ -418,7 +418,7 @@ int block_fsync(struct file *filp, struct dentry *dentry, int datasync) error = 0; return error; } -EXPORT_SYMBOL(block_fsync); +EXPORT_SYMBOL(blkdev_fsync); /* * pseudo-fs @@ -1482,7 +1482,7 @@ const struct file_operations def_blk_fops = { .aio_read = generic_file_aio_read, .aio_write = blkdev_aio_write, .mmap = generic_file_mmap, - .fsync = block_fsync, + .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, diff --git a/include/linux/fs.h b/include/linux/fs.h index 5d9c7e27c5a..39d57bc6cc7 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2212,7 +2212,7 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/block_dev.c */ extern ssize_t blkdev_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos); -extern int block_fsync(struct file *filp, struct dentry *dentry, int datasync); +extern int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync); /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, -- cgit v1.2.3-70-g09d2 From 116354d177ba2da37e91cf884e3d11e67f825efd Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 6 Apr 2010 14:35:04 -0700 Subject: pagemap: fix pfn calculation for hugepage When we look into pagemap using page-types with option -p, the value of pfn for hugepages looks wrong (see below.) This is because pte was evaluated only once for one vma although it should be updated for each hugepage. This patch fixes it. $ page-types -p 3277 -Nl -b huge voffset offset len flags 7f21e8a00 11e400 1 ___U___________H_G________________ 7f21e8a01 11e401 1ff ________________TG________________ ^^^ 7f21e8c00 11e400 1 ___U___________H_G________________ 7f21e8c01 11e401 1ff ________________TG________________ ^^^ One hugepage contains 1 head page and 511 tail pages in x86_64 and each two lines represent each hugepage. Voffset and offset mean virtual address and physical address in the page unit, respectively. The different hugepages should not have the same offset value. With this patch applied: $ page-types -p 3386 -Nl -b huge voffset offset len flags 7fec7a600 112c00 1 ___UD__________H_G________________ 7fec7a601 112c01 1ff ________________TG________________ ^^^ 7fec7a800 113200 1 ___UD__________H_G________________ 7fec7a801 113201 1ff ________________TG________________ ^^^ OK More info: - This patch modifies walk_page_range()'s hugepage walker. But the change only affects pagemap_read(), which is the only caller of hugepage callback. - Without this patch, hugetlb_entry() callback is called per vma, that doesn't match the natural expectation from its name. - With this patch, hugetlb_entry() is called per hugepte entry and the callback can become much simpler. Signed-off-by: Naoya Horiguchi Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 27 +++++++-------------------- include/linux/mm.h | 4 ++-- mm/pagewalk.c | 47 +++++++++++++++++++++++++++++++++++++---------- 3 files changed, 46 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a05a669510a..070553427dd 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -662,31 +662,18 @@ static u64 huge_pte_to_pagemap_entry(pte_t pte, int offset) return pme; } -static int pagemap_hugetlb_range(pte_t *pte, unsigned long addr, - unsigned long end, struct mm_walk *walk) +/* This function walks within one hugetlb entry in the single call */ +static int pagemap_hugetlb_range(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) { - struct vm_area_struct *vma; struct pagemapread *pm = walk->private; - struct hstate *hs = NULL; int err = 0; + u64 pfn; - vma = find_vma(walk->mm, addr); - if (vma) - hs = hstate_vma(vma); for (; addr != end; addr += PAGE_SIZE) { - u64 pfn = PM_NOT_PRESENT; - - if (vma && (addr >= vma->vm_end)) { - vma = find_vma(walk->mm, addr); - if (vma) - hs = hstate_vma(vma); - } - - if (vma && (vma->vm_start <= addr) && is_vm_hugetlb_page(vma)) { - /* calculate pfn of the "raw" page in the hugepage. */ - int offset = (addr & ~huge_page_mask(hs)) >> PAGE_SHIFT; - pfn = huge_pte_to_pagemap_entry(*pte, offset); - } + int offset = (addr & ~hmask) >> PAGE_SHIFT; + pfn = huge_pte_to_pagemap_entry(*pte, offset); err = add_to_pagemap(addr, pfn, pm); if (err) return err; diff --git a/include/linux/mm.h b/include/linux/mm.h index e70f21beb4b..462acaf36f3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -783,8 +783,8 @@ struct mm_walk { int (*pmd_entry)(pmd_t *, unsigned long, unsigned long, struct mm_walk *); int (*pte_entry)(pte_t *, unsigned long, unsigned long, struct mm_walk *); int (*pte_hole)(unsigned long, unsigned long, struct mm_walk *); - int (*hugetlb_entry)(pte_t *, unsigned long, unsigned long, - struct mm_walk *); + int (*hugetlb_entry)(pte_t *, unsigned long, + unsigned long, unsigned long, struct mm_walk *); struct mm_struct *mm; void *private; }; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 7b47a57b664..8b1a2ce21ee 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -80,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = huge_pte_offset(walk->mm, addr & hmask); + if (pte && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (err) + return err; + } while (addr = next, addr != end); + + return 0; +} +#endif + /** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk @@ -128,20 +159,16 @@ int walk_page_range(unsigned long addr, unsigned long end, vma = find_vma(walk->mm, addr); #ifdef CONFIG_HUGETLB_PAGE if (vma && is_vm_hugetlb_page(vma)) { - pte_t *pte; - struct hstate *hs; - if (vma->vm_end < next) next = vma->vm_end; - hs = hstate_vma(vma); - pte = huge_pte_offset(walk->mm, - addr & huge_page_mask(hs)); - if (pte && !huge_pte_none(huge_ptep_get(pte)) - && walk->hugetlb_entry) - err = walk->hugetlb_entry(pte, addr, - next, walk); + /* + * Hugepage is very tightly coupled with vma, so + * walk through hugetlb entries within a given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); if (err) break; + pgd = pgd_offset(walk->mm, next); continue; } #endif -- cgit v1.2.3-70-g09d2 From cc4fc29e59c386919a7674e203be7822dc968dc0 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 6 Apr 2010 14:35:09 -0700 Subject: fs-cache: order the debugfs stats correctly Order the debugfs statistics correctly. The values displayed through a seq_printf() statement should be in the same order as the names in the format string. In the 'Lookups' line, objects created ('crt=') and lookups timed out ('tmo=') have their values transposed. Signed-off-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fscache/stats.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 46435f3aae6..4765190d537 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -165,8 +165,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_object_lookups), atomic_read(&fscache_n_object_lookups_negative), atomic_read(&fscache_n_object_lookups_positive), - atomic_read(&fscache_n_object_lookups_timed_out), - atomic_read(&fscache_n_object_created)); + atomic_read(&fscache_n_object_created), + atomic_read(&fscache_n_object_lookups_timed_out)); seq_printf(m, "Updates: n=%u nul=%u run=%u\n", atomic_read(&fscache_n_updates), -- cgit v1.2.3-70-g09d2 From 04287f975e68038051eb9c79896866d36610b8e0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 8 Apr 2010 00:06:07 +0100 Subject: Have nfs ->d_revalidate() report errors properly If nfs atomic open implementation ends up doing open request from ->d_revalidate() codepath and gets an error from server, return that error to caller explicitly and don't bother with lookup_instantiate_filp() at all. ->d_revalidate() can return an error itself just fine... See http://bugzilla.kernel.org/show_bug.cgi?id=15674 http://marc.info/?l=linux-kernel&m=126988782722711&w=2 for original report. Reported-by: Daniel J Blueman Signed-off-by: Al Viro Signed-off-by: Linus Torvalds --- fs/nfs/nfs4proc.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d79a7b37e56..fe0cd9eb1d4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2068,8 +2068,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st case -EDQUOT: case -ENOSPC: case -EROFS: - lookup_instantiate_filp(nd, (struct dentry *)state, NULL); - return 1; + return PTR_ERR(state); default: goto out_drop; } -- cgit v1.2.3-70-g09d2 From 69ecbbedac8e353bbd924fad16fed0c7c54e6382 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 15 Mar 2010 11:21:13 +0300 Subject: udf: potential integer overflow bloc->logicalBlockNum is unsigned so it's never less than zero. When I saw that, it made me worry that "bloc->logicalBlockNum + count" could overflow. That's why I changed the check for less than zero to an overflow check. (The test works because "count" is also unsigned.) Signed-off-by: Dan Carpenter Signed-off-by: Jan Kara --- fs/udf/balloc.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 19626e2491c..9a9378b4eb5 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -125,9 +125,8 @@ static void udf_bitmap_free_blocks(struct super_block *sb, mutex_lock(&sbi->s_alloc_mutex); partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; - if (bloc->logicalBlockNum < 0 || - (bloc->logicalBlockNum + count) > - partmap->s_partition_len) { + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); @@ -393,9 +392,8 @@ static void udf_table_free_blocks(struct super_block *sb, mutex_lock(&sbi->s_alloc_mutex); partmap = &sbi->s_partmaps[bloc->partitionReferenceNum]; - if (bloc->logicalBlockNum < 0 || - (bloc->logicalBlockNum + count) > - partmap->s_partition_len) { + if (bloc->logicalBlockNum + count < count || + (bloc->logicalBlockNum + count) > partmap->s_partition_len) { udf_debug("%d < %d || %d + %d > %d\n", bloc->logicalBlockNum, 0, bloc->logicalBlockNum, count, partmap->s_partition_len); -- cgit v1.2.3-70-g09d2 From c15d0fc0fc399d2639240b35ad7ed93ed5a59412 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Mon, 29 Mar 2010 11:05:21 +0400 Subject: udf: add speciffic ->setattr callback generic setattr not longer responsible for quota transfer. use udf_setattr for all udf's inodes. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/udf/file.c | 2 +- fs/udf/inode.c | 2 +- fs/udf/namei.c | 9 ++++++++- fs/udf/udfdecl.h | 3 ++- 4 files changed, 12 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/udf/file.c b/fs/udf/file.c index 1eb06774ed9..4b6a46ccbf4 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -218,7 +218,7 @@ const struct file_operations udf_file_operations = { .llseek = generic_file_llseek, }; -static int udf_setattr(struct dentry *dentry, struct iattr *iattr) +int udf_setattr(struct dentry *dentry, struct iattr *iattr) { struct inode *inode = dentry->d_inode; int error; diff --git a/fs/udf/inode.c b/fs/udf/inode.c index bb863fe579a..8a3fbd177ca 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1314,7 +1314,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &udf_symlink_inode_operations; inode->i_mode = S_IFLNK | S_IRWXUGO; break; case ICBTAG_FILE_TYPE_MAIN: diff --git a/fs/udf/namei.c b/fs/udf/namei.c index db423ab078b..75816025f95 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -925,7 +925,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, iinfo = UDF_I(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &page_symlink_inode_operations; + inode->i_op = &udf_symlink_inode_operations; if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; @@ -1393,6 +1393,7 @@ const struct export_operations udf_export_ops = { const struct inode_operations udf_dir_inode_operations = { .lookup = udf_lookup, .create = udf_create, + .setattr = udf_setattr, .link = udf_link, .unlink = udf_unlink, .symlink = udf_symlink, @@ -1401,3 +1402,9 @@ const struct inode_operations udf_dir_inode_operations = { .mknod = udf_mknod, .rename = udf_rename, }; +const struct inode_operations udf_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = udf_setattr, +}; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 4223ac855da..702a1148e70 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -76,6 +76,7 @@ extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; +extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; @@ -131,7 +132,7 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, /* file.c */ extern int udf_ioctl(struct inode *, struct file *, unsigned int, unsigned long); - +extern int udf_setattr(struct dentry *dentry, struct iattr *iattr); /* inode.c */ extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); extern int udf_sync_inode(struct inode *); -- cgit v1.2.3-70-g09d2 From 2844a76a25a2fc2f5025cf128c95a14d86146d33 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 9 Apr 2010 15:46:42 -0700 Subject: ceph: decode v5 of osdmap (pool names) [protocol change] Teach the client to decode an updated format for the osdmap. The new format includes pool names, which will be useful shortly. Get this change in earlier rather than later. Signed-off-by: Sage Weil --- fs/ceph/osdmap.c | 180 +++++++++++++++++++++++++++++++++---------------------- fs/ceph/osdmap.h | 1 + fs/ceph/rados.h | 6 +- 3 files changed, 114 insertions(+), 73 deletions(-) (limited to 'fs') diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c index d82fe87c2a6..7526d6d3301 100644 --- a/fs/ceph/osdmap.c +++ b/fs/ceph/osdmap.c @@ -312,71 +312,6 @@ bad: return ERR_PTR(err); } - -/* - * osd map - */ -void ceph_osdmap_destroy(struct ceph_osdmap *map) -{ - dout("osdmap_destroy %p\n", map); - if (map->crush) - crush_destroy(map->crush); - while (!RB_EMPTY_ROOT(&map->pg_temp)) { - struct ceph_pg_mapping *pg = - rb_entry(rb_first(&map->pg_temp), - struct ceph_pg_mapping, node); - rb_erase(&pg->node, &map->pg_temp); - kfree(pg); - } - while (!RB_EMPTY_ROOT(&map->pg_pools)) { - struct ceph_pg_pool_info *pi = - rb_entry(rb_first(&map->pg_pools), - struct ceph_pg_pool_info, node); - rb_erase(&pi->node, &map->pg_pools); - kfree(pi); - } - kfree(map->osd_state); - kfree(map->osd_weight); - kfree(map->osd_addr); - kfree(map); -} - -/* - * adjust max osd value. reallocate arrays. - */ -static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) -{ - u8 *state; - struct ceph_entity_addr *addr; - u32 *weight; - - state = kcalloc(max, sizeof(*state), GFP_NOFS); - addr = kcalloc(max, sizeof(*addr), GFP_NOFS); - weight = kcalloc(max, sizeof(*weight), GFP_NOFS); - if (state == NULL || addr == NULL || weight == NULL) { - kfree(state); - kfree(addr); - kfree(weight); - return -ENOMEM; - } - - /* copy old? */ - if (map->osd_state) { - memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); - memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); - memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); - kfree(map->osd_state); - kfree(map->osd_addr); - kfree(map->osd_weight); - } - - map->osd_state = state; - map->osd_weight = weight; - map->osd_addr = addr; - map->max_osd = max; - return 0; -} - /* * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid * to a set of osds) @@ -480,6 +415,13 @@ static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id) return NULL; } +static void __remove_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *pi) +{ + rb_erase(&pi->node, root); + kfree(pi->name); + kfree(pi); +} + void __decode_pool(void **p, struct ceph_pg_pool_info *pi) { ceph_decode_copy(p, &pi->v, sizeof(pi->v)); @@ -488,6 +430,98 @@ void __decode_pool(void **p, struct ceph_pg_pool_info *pi) *p += le32_to_cpu(pi->v.num_removed_snap_intervals) * sizeof(u64) * 2; } +static int __decode_pool_names(void **p, void *end, struct ceph_osdmap *map) +{ + struct ceph_pg_pool_info *pi; + u32 num, len, pool; + + ceph_decode_32_safe(p, end, num, bad); + dout(" %d pool names\n", num); + while (num--) { + ceph_decode_32_safe(p, end, pool, bad); + ceph_decode_32_safe(p, end, len, bad); + dout(" pool %d len %d\n", pool, len); + pi = __lookup_pg_pool(&map->pg_pools, pool); + if (pi) { + kfree(pi->name); + pi->name = kmalloc(len + 1, GFP_NOFS); + if (pi->name) { + memcpy(pi->name, *p, len); + pi->name[len] = '\0'; + dout(" name is %s\n", pi->name); + } + } + *p += len; + } + return 0; + +bad: + return -EINVAL; +} + +/* + * osd map + */ +void ceph_osdmap_destroy(struct ceph_osdmap *map) +{ + dout("osdmap_destroy %p\n", map); + if (map->crush) + crush_destroy(map->crush); + while (!RB_EMPTY_ROOT(&map->pg_temp)) { + struct ceph_pg_mapping *pg = + rb_entry(rb_first(&map->pg_temp), + struct ceph_pg_mapping, node); + rb_erase(&pg->node, &map->pg_temp); + kfree(pg); + } + while (!RB_EMPTY_ROOT(&map->pg_pools)) { + struct ceph_pg_pool_info *pi = + rb_entry(rb_first(&map->pg_pools), + struct ceph_pg_pool_info, node); + __remove_pg_pool(&map->pg_pools, pi); + } + kfree(map->osd_state); + kfree(map->osd_weight); + kfree(map->osd_addr); + kfree(map); +} + +/* + * adjust max osd value. reallocate arrays. + */ +static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) +{ + u8 *state; + struct ceph_entity_addr *addr; + u32 *weight; + + state = kcalloc(max, sizeof(*state), GFP_NOFS); + addr = kcalloc(max, sizeof(*addr), GFP_NOFS); + weight = kcalloc(max, sizeof(*weight), GFP_NOFS); + if (state == NULL || addr == NULL || weight == NULL) { + kfree(state); + kfree(addr); + kfree(weight); + return -ENOMEM; + } + + /* copy old? */ + if (map->osd_state) { + memcpy(state, map->osd_state, map->max_osd*sizeof(*state)); + memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr)); + memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight)); + kfree(map->osd_state); + kfree(map->osd_addr); + kfree(map->osd_weight); + } + + map->osd_state = state; + map->osd_weight = weight; + map->osd_addr = addr; + map->max_osd = max; + return 0; +} + /* * decode a full map. */ @@ -524,7 +558,7 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) ceph_decode_32_safe(p, end, max, bad); while (max--) { ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad); - pi = kmalloc(sizeof(*pi), GFP_NOFS); + pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) goto bad; pi->id = ceph_decode_32(p); @@ -537,6 +571,10 @@ struct ceph_osdmap *osdmap_decode(void **p, void *end) __decode_pool(p, pi); __insert_pg_pool(&map->pg_pools, pi); } + + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; + ceph_decode_32_safe(p, end, map->pool_max, bad); ceph_decode_32_safe(p, end, map->flags, bad); @@ -710,7 +748,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } pi = __lookup_pg_pool(&map->pg_pools, pool); if (!pi) { - pi = kmalloc(sizeof(*pi), GFP_NOFS); + pi = kzalloc(sizeof(*pi), GFP_NOFS); if (!pi) { err = -ENOMEM; goto bad; @@ -720,6 +758,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, } __decode_pool(p, pi); } + if (version >= 5 && __decode_pool_names(p, end, map) < 0) + goto bad; /* old_pool */ ceph_decode_32_safe(p, end, len, bad); @@ -728,10 +768,8 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ceph_decode_32_safe(p, end, pool, bad); pi = __lookup_pg_pool(&map->pg_pools, pool); - if (pi) { - rb_erase(&pi->node, &map->pg_pools); - kfree(pi); - } + if (pi) + __remove_pg_pool(&map->pg_pools, pi); } /* new_up */ diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h index 1fb55afb264..8bc9f1e4f56 100644 --- a/fs/ceph/osdmap.h +++ b/fs/ceph/osdmap.h @@ -23,6 +23,7 @@ struct ceph_pg_pool_info { int id; struct ceph_pg_pool v; int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; + char *name; }; struct ceph_pg_mapping { diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h index 26ac8b89a67..a1fc1d017b5 100644 --- a/fs/ceph/rados.h +++ b/fs/ceph/rados.h @@ -11,8 +11,10 @@ /* * osdmap encoding versions */ -#define CEPH_OSDMAP_INC_VERSION 4 -#define CEPH_OSDMAP_VERSION 4 +#define CEPH_OSDMAP_INC_VERSION 5 +#define CEPH_OSDMAP_INC_VERSION_EXT 5 +#define CEPH_OSDMAP_VERSION 5 +#define CEPH_OSDMAP_VERSION_EXT 5 /* * fs id -- cgit v1.2.3-70-g09d2 From 80e60639f1b7c121a7fea53920c5a4b94009361a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2010 13:51:05 -0400 Subject: NFSv4: Fall back to ordinary lookup if nfs4_atomic_open() returns EISDIR Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c6f2750648f..be46f26c9a5 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1025,12 +1025,12 @@ static struct dentry *nfs_atomic_lookup(struct inode *dir, struct dentry *dentry res = NULL; goto out; /* This turned out not to be a regular file */ + case -EISDIR: case -ENOTDIR: goto no_open; case -ELOOP: if (!(nd->intent.open.flags & O_NOFOLLOW)) goto no_open; - /* case -EISDIR: */ /* case -EINVAL: */ default: goto out; -- cgit v1.2.3-70-g09d2 From 1544fa0f7a46241582abc48f07b74f3d846379e4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 25 Mar 2010 13:54:49 -0400 Subject: NFS: Fix the mode calculation in nfs_find_open_context Signed-off-by: Trond Myklebust --- fs/nfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 737128f777f..50a56edca0b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -623,10 +623,10 @@ struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_c list_for_each_entry(pos, &nfsi->open_files, list) { if (cred != NULL && pos->cred != cred) continue; - if ((pos->mode & mode) == mode) { - ctx = get_nfs_open_context(pos); - break; - } + if ((pos->mode & (FMODE_READ|FMODE_WRITE)) != mode) + continue; + ctx = get_nfs_open_context(pos); + break; } spin_unlock(&inode->i_lock); return ctx; -- cgit v1.2.3-70-g09d2 From b80c3cb628f0ebc241b02e38dd028969fb8026a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Apr 2010 19:07:07 -0400 Subject: NFS: Ensure that writeback_single_inode() calls write_inode() when syncing Since writeback_single_inode() checks the inode->i_state flags _before_ it flushes out the data, we need to ensure that the I_DIRTY_DATASYNC flag is already set. Otherwise we risk not seeing a call to write_inode(), which again means that we break fsync() et al... Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53ff70e2399..7f40ea30554 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -421,6 +421,7 @@ static void nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); + __mark_inode_dirty(req->wb_page->mapping->host, I_DIRTY_DATASYNC); } #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) @@ -660,6 +661,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, req = nfs_setup_write_request(ctx, page, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + nfs_mark_request_dirty(req); /* Update file length */ nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); @@ -739,8 +741,6 @@ int nfs_updatepage(struct file *file, struct page *page, status = nfs_writepage_setup(ctx, page, offset, count); if (status < 0) nfs_set_pageerror(page); - else - __set_page_dirty_nobuffers(page); dprintk("NFS: nfs_updatepage returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); -- cgit v1.2.3-70-g09d2 From a6305ddb080fb483ca41ca56cacb6f96089f0c8e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Apr 2010 19:07:08 -0400 Subject: NFS: Fix a race with the new commit code This patch fixes a race which occurs due to the fact that we release the PG_writeback flag while still holding the nfs_page locked. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7f40ea30554..40297c4f1ee 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -201,6 +201,7 @@ static int nfs_set_page_writeback(struct page *page) struct inode *inode = page->mapping->host; struct nfs_server *nfss = NFS_SERVER(inode); + page_cache_get(page); if (atomic_long_inc_return(&nfss->writeback) > NFS_CONGESTION_ON_THRESH) { set_bdi_congested(&nfss->backing_dev_info, @@ -216,6 +217,7 @@ static void nfs_end_page_writeback(struct page *page) struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); + page_cache_release(page); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); } @@ -665,6 +667,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, struct page *page, /* Update file length */ nfs_grow_file(page, offset, count); nfs_mark_uptodate(page, req->wb_pgbase, req->wb_bytes); + nfs_mark_request_dirty(req); nfs_clear_page_tag_locked(req); return 0; } @@ -749,13 +752,12 @@ int nfs_updatepage(struct file *file, struct page *page, static void nfs_writepage_release(struct nfs_page *req) { + struct page *page = req->wb_page; - if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) { - nfs_end_page_writeback(req->wb_page); + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req)) nfs_inode_remove_request(req); - } else - nfs_end_page_writeback(req->wb_page); nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } static int flush_task_priority(int how) @@ -847,9 +849,11 @@ static int nfs_write_rpcsetup(struct nfs_page *req, */ static void nfs_redirty_request(struct nfs_page *req) { + struct page *page = req->wb_page; + nfs_mark_request_dirty(req); - nfs_end_page_writeback(req->wb_page); nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } /* @@ -1084,16 +1088,15 @@ static void nfs_writeback_release_full(void *calldata) if (nfs_write_need_commit(data)) { memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); nfs_mark_request_commit(req); - nfs_end_page_writeback(page); dprintk(" marked for commit\n"); goto next; } dprintk(" OK\n"); remove_request: - nfs_end_page_writeback(page); nfs_inode_remove_request(req); next: nfs_clear_page_tag_locked(req); + nfs_end_page_writeback(page); } nfs_writedata_release(calldata); } -- cgit v1.2.3-70-g09d2 From 2c61be0a9478258f77b66208a0c4b1f5f8161c3c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Apr 2010 19:54:50 -0400 Subject: NFS: Ensure that the WRITE and COMMIT RPC calls are always uninterruptible We always want to ensure that WRITE and COMMIT completes, whether or not the user presses ^C. Do this by making the call asynchronous, and allowing the user to do an interruptible wait for rpc_task completion. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 40297c4f1ee..de38d63aa92 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -781,7 +781,6 @@ static int nfs_write_rpcsetup(struct nfs_page *req, int how) { struct inode *inode = req->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { @@ -796,9 +795,10 @@ static int nfs_write_rpcsetup(struct nfs_page *req, .callback_ops = call_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = flags, + .flags = RPC_TASK_ASYNC, .priority = priority, }; + int ret = 0; /* Set up the RPC argument and reply structs * NB: take care not to mess about with data->commit et al. */ @@ -837,10 +837,18 @@ static int nfs_write_rpcsetup(struct nfs_page *req, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) - return PTR_ERR(task); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + if (how & FLUSH_SYNC) { + ret = rpc_wait_for_completion_task(task); + if (ret == 0) + ret = task->tk_status; + } rpc_put_task(task); - return 0; +out: + return ret; } /* If a nfs_flush_* function fails, it should remove reqs from @head and @@ -1210,7 +1218,6 @@ static int nfs_commit_rpcsetup(struct list_head *head, { struct nfs_page *first = nfs_list_entry(head->next); struct inode *inode = first->wb_context->path.dentry->d_inode; - int flags = (how & FLUSH_SYNC) ? 0 : RPC_TASK_ASYNC; int priority = flush_task_priority(how); struct rpc_task *task; struct rpc_message msg = { @@ -1225,7 +1232,7 @@ static int nfs_commit_rpcsetup(struct list_head *head, .callback_ops = &nfs_commit_ops, .callback_data = data, .workqueue = nfsiod_workqueue, - .flags = flags, + .flags = RPC_TASK_ASYNC, .priority = priority, }; @@ -1255,6 +1262,8 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); + if (how & FLUSH_SYNC) + rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } -- cgit v1.2.3-70-g09d2 From be3bd2223b89d270853302ab0a5909fa875fd831 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 12 Apr 2010 01:51:03 +0900 Subject: nilfs2: fix typo "numer" -> "number" in alloc.c Fixes the typo found in a warning message of a persistent object allocator function. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c index 3f959f1879d..c2a13870c60 100644 --- a/fs/nilfs2/alloc.c +++ b/fs/nilfs2/alloc.c @@ -425,7 +425,7 @@ void nilfs_palloc_abort_alloc_entry(struct inode *inode, bitmap = bitmap_kaddr + bh_offset(req->pr_bitmap_bh); if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group), group_offset, bitmap)) - printk(KERN_WARNING "%s: entry numer %llu already freed\n", + printk(KERN_WARNING "%s: entry number %llu already freed\n", __func__, (unsigned long long)req->pr_entry_nr); nilfs_palloc_group_desc_add_entries(inode, group, desc, 1); -- cgit v1.2.3-70-g09d2 From 0df5dd4aae211edeeeb84f7f84f6d093406d7c22 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 11 Apr 2010 16:48:44 -0400 Subject: NFSv4: fix delegated locking Arnaud Giersch reports that NFSv4 locking is broken when we hold a delegation since commit 8e469ebd6dc32cbaf620e134d79f740bf0ebab79 (NFSv4: Don't allow posix locking against servers that don't support it). According to Arnaud, the lock succeeds the first time he opens the file (since we cannot do a delegated open) but then fails after we start using delegated opens. The following patch fixes it by ensuring that locking behaviour is governed by a per-filesystem capability flag that is initially set, but gets cleared if the server ever returns an OPEN without the NFS4_OPEN_RESULT_LOCKTYPE_POSIX flag being set. Reported-by: Arnaud Giersch Signed-off-by: Trond Myklebust Cc: stable@kernel.org --- fs/nfs/client.c | 3 ++- fs/nfs/nfs4proc.c | 4 +++- include/linux/nfs_fs_sb.h | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 2a3d352c0bf..a8766c4ef2e 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1294,7 +1294,8 @@ static int nfs4_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = data->flags; - server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR; + server->caps |= NFS_CAP_ATOMIC_OPEN|NFS_CAP_CHANGE_ATTR| + NFS_CAP_POSIX_LOCK; server->options = data->options; /* Get a client record */ diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index fe0cd9eb1d4..638067007c6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -1523,6 +1523,8 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_post_op_update_inode(dir, o_res->dir_attr); } else nfs_refresh_inode(dir, o_res->dir_attr); + if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) + server->caps &= ~NFS_CAP_POSIX_LOCK; if(o_res->rflags & NFS4_OPEN_RESULT_CONFIRM) { status = _nfs4_proc_open_confirm(data); if (status != 0) @@ -1664,7 +1666,7 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; - if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) + if (server->caps & NFS_CAP_POSIX_LOCK) set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 717a5e54eb1..e82957acea5 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -176,6 +176,7 @@ struct nfs_server { #define NFS_CAP_ATIME (1U << 11) #define NFS_CAP_CTIME (1U << 12) #define NFS_CAP_MTIME (1U << 13) +#define NFS_CAP_POSIX_LOCK (1U << 14) /* maximum number of slots to use */ -- cgit v1.2.3-70-g09d2 From fc7683a3c30c22131b1651271d6bf9ea113b77c5 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 26 Mar 2010 19:29:54 +0300 Subject: ext2: symlink must be handled via filesystem specific operation generic setattr implementation is no longer responsible for quota transfer so synlinks must be handled via ext2_setattr. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext2/symlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 4e2426e22bb..565cf817bbf 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -32,6 +32,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -43,6 +44,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext2_follow_link, + .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, -- cgit v1.2.3-70-g09d2 From 774f03fb2cf89951b5f5f363b7739a2835d5924e Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 26 Mar 2010 19:29:55 +0300 Subject: ext3: symlink must be handled via filesystem specific operation generic setattr implementation is no longer responsible for quota transfer so synlinks must be handled via ext3_setattr. Signed-off-by: Dmitry Monakhov Signed-off-by: Jan Kara --- fs/ext3/symlink.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index ff7b4ccd898..7c489820777 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -34,6 +34,7 @@ const struct inode_operations ext3_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, + .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -45,6 +46,7 @@ const struct inode_operations ext3_symlink_inode_operations = { const struct inode_operations ext3_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext3_follow_link, + .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, -- cgit v1.2.3-70-g09d2 From 4c5e6c0e70fd6ca2fa67184fd36a261b3b7b38d0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 6 Apr 2010 18:52:47 +0200 Subject: quota: Hide warnings about writes to the filesystem before quota was turned on For a root filesystem write to the filesystem before quota is turned on happens regularly and there's no way around it because of writes to syslog, /etc/mtab, and similar. So the warning is rather pointless for ordinary users. It's still useful during development so we just hide the warning behind __DQUOT_PARANOIA config option. Signed-off-by: Jan Kara --- fs/quota/dquot.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index e0b870f4749..f9a37513f24 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -874,14 +874,18 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; +#ifdef __DQUOT_PARANOIA int reserved = 0; +#endif spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; +#ifdef __DQUOT_PARANOIA if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; +#endif if (!atomic_read(&inode->i_writecount)) continue; if (!dqinit_needed(inode, type)) @@ -903,11 +907,13 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); +#ifdef __DQUOT_PARANOIA if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened before quota" " was turned on thus quota information is probably " "inconsistent. Please run quotacheck(8).\n", sb->s_id); } +#endif } /* -- cgit v1.2.3-70-g09d2 From 08261673cb6dc638c39f44d69b76fffb57b92a8b Mon Sep 17 00:00:00 2001 From: Andrew Perepechko Date: Mon, 12 Apr 2010 22:16:50 +0400 Subject: quota: Fix possible dq_flags corruption dq_flags are modified non-atomically in do_set_dqblk via __set_bit calls and atomically for example in mark_dquot_dirty or clear_dquot_dirty. Hence a change done by an atomic operation can be overwritten by a change done by a non-atomic one. Fix the problem by using atomic bitops even in do_set_dqblk. Signed-off-by: Andrew Perepechko Signed-off-by: Jan Kara --- fs/quota/dquot.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f9a37513f24..a0a9405b202 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2328,34 +2328,34 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) if (di->dqb_valid & QIF_SPACE) { dm->dqb_curspace = di->dqb_curspace - dm->dqb_rsvspace; check_blim = 1; - __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_BLIMITS) { dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); check_blim = 1; - __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_INODES) { dm->dqb_curinodes = di->dqb_curinodes; check_ilim = 1; - __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_ILIMITS) { dm->dqb_isoftlimit = di->dqb_isoftlimit; dm->dqb_ihardlimit = di->dqb_ihardlimit; check_ilim = 1; - __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_BTIME) { dm->dqb_btime = di->dqb_btime; check_blim = 1; - __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_ITIME) { dm->dqb_itime = di->dqb_itime; check_ilim = 1; - __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); } if (check_blim) { -- cgit v1.2.3-70-g09d2 From f5b066287c74b624583b993395a65d03a6487b3a Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 12 Apr 2010 14:24:28 -0700 Subject: ceph: fix dentry reference leak in dcache readdir When filldir returned an error (e.g. buffer full for a large directory), we would leak a dentry reference, causing an oops on umount. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index aed8fda3302..7505b4f1f59 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -170,11 +170,11 @@ more: spin_lock(&inode->i_lock); spin_lock(&dcache_lock); + last = dentry; + if (err < 0) goto out_unlock; - last = dentry; - p = p->prev; filp->f_pos++; -- cgit v1.2.3-70-g09d2 From 032d8f7268444a0f5d4ee02d9513d682d5b8edfc Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 13 Apr 2010 17:46:37 +0200 Subject: [LogFS] Prevent memory corruption on large deletes Removing sufficiently large files would create aliases for a large number of segments. This in turn results in a large number of journal entries and an overflow of s_je_array. Cheap fix is to add a BUG_ON, turning memory corruption into something annoying, but less dangerous. Real fix is to count the number of affected segments and prevent the problem completely. Signed-off-by: Joern Engel --- fs/logfs/gc.c | 8 ++++++++ fs/logfs/journal.c | 3 +++ fs/logfs/logfs.h | 8 +++++++- fs/logfs/readwrite.c | 14 ++++++++++++++ fs/logfs/super.c | 2 ++ 5 files changed, 34 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c index 92949f95a90..e8253e7fb6b 100644 --- a/fs/logfs/gc.c +++ b/fs/logfs/gc.c @@ -458,6 +458,14 @@ static void __logfs_gc_pass(struct super_block *sb, int target) struct logfs_block *block; int round, progress, last_progress = 0; + /* + * Doing too many changes to the segfile at once would result + * in a large number of aliases. Write the journal before + * things get out of hand. + */ + if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) + logfs_write_anchor(sb); + if (no_free_segments(sb) >= target && super->s_no_object_aliases < MAX_OBJ_ALIASES) return; diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index d57c7b07b60..2c22a4ad532 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -493,6 +493,8 @@ static void account_shadows(struct super_block *sb) btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); + btree_grim_visitor32(&tree->segment_map, 0, NULL); + tree->no_shadowed_segments = 0; if (li->li_block) { /* @@ -660,6 +662,7 @@ static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, if (ofs < 0) return ofs; logfs_buf_write(area, ofs, super->s_compressed_je, len); + BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); return 0; } diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 97195b9e93a..c9929eed80b 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -257,10 +257,14 @@ struct logfs_shadow { * struct shadow_tree * @new: shadows where old_ofs==0, indexed by new_ofs * @old: shadows where old_ofs!=0, indexed by old_ofs + * @segment_map: bitfield of segments containing shadows + * @no_shadowed_segment: number of segments containing shadows */ struct shadow_tree { struct btree_head64 new; struct btree_head64 old; + struct btree_head32 segment_map; + int no_shadowed_segments; }; struct object_alias_item { @@ -311,6 +315,8 @@ struct logfs_block_ops { write_alias_t *write_one_alias); }; +#define MAX_JOURNAL_ENTRIES 256 + struct logfs_super { struct mtd_info *s_mtd; /* underlying device */ struct block_device *s_bdev; /* underlying device */ @@ -377,7 +383,7 @@ struct logfs_super { u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ u64 s_last_version; struct logfs_area *s_journal_area; /* open journal segment */ - __be64 s_je_array[64]; + __be64 s_je_array[MAX_JOURNAL_ENTRIES]; int s_no_je; int s_sum_index; /* for the 12 summaries */ diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 3659c37fbd7..7e0c39c4971 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1219,6 +1219,18 @@ static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) mempool_free(shadow, super->s_shadow_pool); } +static void mark_segment(struct shadow_tree *tree, u32 segno) +{ + int err; + + if (!btree_lookup32(&tree->segment_map, segno)) { + err = btree_insert32(&tree->segment_map, segno, (void *)1, + GFP_NOFS); + BUG_ON(err); + tree->no_shadowed_segments++; + } +} + /** * fill_shadow_tree - Propagate shadow tree changes due to a write * @inode: Inode owning the page @@ -1266,6 +1278,8 @@ static void fill_shadow_tree(struct inode *inode, struct page *page, super->s_dirty_used_bytes += shadow->new_len; super->s_dirty_free_bytes += shadow->old_len; + mark_segment(tree, shadow->old_ofs >> super->s_segshift); + mark_segment(tree, shadow->new_ofs >> super->s_segshift); } } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index 9d856c49afc..d6e1f4fc311 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -451,6 +451,8 @@ static int logfs_read_sb(struct super_block *sb, int read_only) btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); + btree_init_mempool32(&super->s_shadow_tree.segment_map, + super->s_btree_pool); ret = logfs_init_mapping(sb); if (ret) -- cgit v1.2.3-70-g09d2 From d3a03f8031000f8297823b80e36db536fd020884 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 13 Apr 2010 17:54:27 +0200 Subject: [LogFS] Plug 8 byte information leak Within each journal segment, 8 bytes at offset 24 would remain uninitialized. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 2c22a4ad532..2957bfc2192 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -388,7 +388,10 @@ static void journal_get_erase_count(struct logfs_area *area) static int journal_erase_segment(struct logfs_area *area) { struct super_block *sb = area->a_sb; - struct logfs_segment_header sh; + union { + struct logfs_segment_header sh; + unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; + } u; u64 ofs; int err; @@ -396,20 +399,21 @@ static int journal_erase_segment(struct logfs_area *area) if (err) return err; - sh.pad = 0; - sh.type = SEG_JOURNAL; - sh.level = 0; - sh.segno = cpu_to_be32(area->a_segno); - sh.ec = cpu_to_be32(area->a_erase_count); - sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - sh.crc = logfs_crc32(&sh, sizeof(sh), 4); + memset(&u, 0, sizeof(u)); + u.sh.pad = 0; + u.sh.type = SEG_JOURNAL; + u.sh.level = 0; + u.sh.segno = cpu_to_be32(area->a_segno); + u.sh.ec = cpu_to_be32(area->a_erase_count); + u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); + u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); /* This causes a bug in segment.c. Not yet. */ //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = ALIGN(sizeof(sh), 16); - logfs_buf_write(area, ofs, &sh, sizeof(sh)); + area->a_used_bytes = sizeof(u); + logfs_buf_write(area, ofs, &u, sizeof(u)); return 0; } -- cgit v1.2.3-70-g09d2 From ead88af5f577fd2b399a0fcdfe52605116fac489 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 13 Apr 2010 17:57:21 +0200 Subject: [LogFS] Move assertion The assertion is valid independently of the condition. Signed-off-by: Joern Engel --- fs/logfs/journal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c index 2957bfc2192..fd44eeea513 100644 --- a/fs/logfs/journal.c +++ b/fs/logfs/journal.c @@ -612,9 +612,9 @@ static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, if (len == 0) return logfs_write_header(super, header, 0, type); + BUG_ON(len > sb->s_blocksize); compr_len = logfs_compress(buf, data, len, sb->s_blocksize); if (compr_len < 0 || type == JE_ANCHOR) { - BUG_ON(len > sb->s_blocksize); memcpy(data, buf, len); compr_len = len; compr = COMPR_NONE; -- cgit v1.2.3-70-g09d2 From fc837c8f0446b73a1661339db406c0238dd1d184 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 13 Apr 2010 11:41:22 -0700 Subject: ceph: queue_cap_snap should always queue dirty context This simplifies the calling convention, and fixes a bug where we queue a capsnap with a context other than i_head_snapc (the one that matches the dirty pages). The result was a BUG at fs/ceph/caps.c:2178 on writeback completion when a capsnap matching the writeback snapc could not be found. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 16 +++++++--------- fs/ceph/super.h | 3 +-- 2 files changed, 8 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 7e3e5f9edaa..d197431532c 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -430,8 +430,7 @@ static int dup_array(u64 **dst, __le64 *src, int num) * Caller must hold snap_rwsem for read (i.e., the realm topology won't * change). */ -void ceph_queue_cap_snap(struct ceph_inode_info *ci, - struct ceph_snap_context *snapc) +void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; @@ -450,10 +449,11 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, as no new writes are allowed to start when pending, so any writes in progress now were started before the previous cap_snap. lucky us. */ - dout("queue_cap_snap %p snapc %p seq %llu used %d" - " already pending\n", inode, snapc, snapc->seq, used); + dout("queue_cap_snap %p already pending\n", inode); kfree(capsnap); } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) { + struct ceph_snap_context *snapc = ci->i_head_snapc; + igrab(inode); atomic_set(&capsnap->nref, 1); @@ -462,7 +462,6 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, INIT_LIST_HEAD(&capsnap->flushing_item); capsnap->follows = snapc->seq - 1; - capsnap->context = ceph_get_snap_context(snapc); capsnap->issued = __ceph_caps_issued(ci, NULL); capsnap->dirty = __ceph_caps_dirty(ci); @@ -479,7 +478,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci, snapshot. */ capsnap->dirty_pages = ci->i_wrbuffer_ref_head; ci->i_wrbuffer_ref_head = 0; - ceph_put_snap_context(ci->i_head_snapc); + capsnap->context = snapc; ci->i_head_snapc = NULL; list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); @@ -603,7 +602,7 @@ more: if (lastinode) iput(lastinode); lastinode = inode; - ceph_queue_cap_snap(ci, realm->cached_context); + ceph_queue_cap_snap(ci); spin_lock(&realm->inodes_with_caps_lock); } spin_unlock(&realm->inodes_with_caps_lock); @@ -825,8 +824,7 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, spin_unlock(&realm->inodes_with_caps_lock); spin_unlock(&inode->i_lock); - ceph_queue_cap_snap(ci, - ci->i_snap_realm->cached_context); + ceph_queue_cap_snap(ci); iput(inode); continue; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 65d12036b67..4c07acaf21e 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -714,8 +714,7 @@ extern int ceph_update_snap_trace(struct ceph_mds_client *m, extern void ceph_handle_snap(struct ceph_mds_client *mdsc, struct ceph_mds_session *session, struct ceph_msg *msg); -extern void ceph_queue_cap_snap(struct ceph_inode_info *ci, - struct ceph_snap_context *snapc); +extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_cap_snap *capsnap); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); -- cgit v1.2.3-70-g09d2 From e1e4dd0caa63e166afa46a1ccc947bebb4f66bcf Mon Sep 17 00:00:00 2001 From: Yehuda Sadeh Date: Tue, 13 Apr 2010 11:45:56 -0700 Subject: ceph: reserve one more caps space when doing readdir We were missing space for the directory cap. The result was a BUG at fs/ceph/caps.c:2178. Signed-off-by: Yehuda Sadeh Signed-off-by: Sage Weil --- fs/ceph/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 7505b4f1f59..159f588ca16 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -311,7 +311,7 @@ more: req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.max_entries = cpu_to_le32(max_entries); - req->r_num_caps = max_entries; + req->r_num_caps = max_entries + 1; err = ceph_mdsc_do_request(mdsc, NULL, req); if (err < 0) { ceph_mdsc_put_request(req); -- cgit v1.2.3-70-g09d2 From a6a5349d17f2a5c37079826f1a1474c3d08c6b53 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 13 Apr 2010 14:07:07 -0700 Subject: ceph: use separate class for ceph sockets' sk_lock Use a separate class for ceph sockets to prevent lockdep confusion. Because ceph sockets only get passed kernel pointers, there is no dependency from sk_lock -> mmap_sem. If we share the same class as other sockets, lockdep detects a circular dependency from mmap_sem (page fault) -> fs mutex -> sk_lock -> mmap_sem because dependencies are noted from both ceph and user contexts. Using a separate class prevents the sk_lock(ceph) -> mmap_sem dependency and makes lockdep happy. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index f35b4945a9c..5c75d5d32b2 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -29,6 +29,10 @@ static char tag_msg = CEPH_MSGR_TAG_MSG; static char tag_ack = CEPH_MSGR_TAG_ACK; static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE; +#ifdef CONFIG_LOCKDEP +static struct lock_class_key socket_class; +#endif + static void queue_con(struct ceph_connection *con); static void con_work(struct work_struct *); @@ -227,6 +231,10 @@ static struct socket *ceph_tcp_connect(struct ceph_connection *con) con->sock = sock; sock->sk->sk_allocation = GFP_NOFS; +#ifdef CONFIG_LOCKDEP + lockdep_set_class(&sock->sk->sk_lock, &socket_class); +#endif + set_sock_callbacks(sock, con); dout("connect %s\n", pr_addr(&con->peer_addr.in_addr)); -- cgit v1.2.3-70-g09d2 From 1f1b0008e8dd1930d6e89522c70f4a438374302a Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Thu, 15 Apr 2010 08:03:57 +0200 Subject: [LogFS] Prevent mempool_destroy NULL pointer dereference It would probably be better to just accept NULL pointers in mempool_destroy(). But for the current -rc series let's keep things simple. This patch was lost in the cracks for a while. Kevin Cernekee had to rediscover the problem and send a similar patch because of it. :( Signed-off-by: Joern Engel --- fs/logfs/logfs.h | 6 ++++++ fs/logfs/readwrite.c | 6 ++---- fs/logfs/segment.c | 2 +- fs/logfs/super.c | 4 ++-- 4 files changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index c9929eed80b..0a3df1a0c93 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -727,4 +727,10 @@ static inline struct logfs_area *get_area(struct super_block *sb, return logfs_super(sb)->s_area[(__force u8)gc_level]; } +static inline void logfs_mempool_destroy(mempool_t *pool) +{ + if (pool) + mempool_destroy(pool); +} + #endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 7e0c39c4971..aca6c56a107 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -2243,8 +2243,6 @@ void logfs_cleanup_rw(struct super_block *sb) struct logfs_super *super = logfs_super(sb); destroy_meta_inode(super->s_segfile_inode); - if (super->s_block_pool) - mempool_destroy(super->s_block_pool); - if (super->s_shadow_pool) - mempool_destroy(super->s_shadow_pool); + logfs_mempool_destroy(super->s_block_pool); + logfs_mempool_destroy(super->s_shadow_pool); } diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 02db22ebbf1..8c82fe05d3e 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -912,7 +912,7 @@ err: for (i--; i >= 0; i--) free_area(super->s_area[i]); free_area(super->s_journal_area); - mempool_destroy(super->s_alias_pool); + logfs_mempool_destroy(super->s_alias_pool); return -ENOMEM; } diff --git a/fs/logfs/super.c b/fs/logfs/super.c index d6e1f4fc311..d4531eb46d0 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -517,8 +517,8 @@ static void logfs_kill_sb(struct super_block *sb) if (super->s_erase_page) __free_page(super->s_erase_page); super->s_devops->put_device(sb); - mempool_destroy(super->s_btree_pool); - mempool_destroy(super->s_alias_pool); + logfs_mempool_destroy(super->s_btree_pool); + logfs_mempool_destroy(super->s_alias_pool); kfree(super); log_super("LogFS: Finished unmounting\n"); } -- cgit v1.2.3-70-g09d2 From 79681842e160c3211eeeb47ea31b061038d1e41e Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 16 Apr 2010 13:59:25 +0800 Subject: ocfs2: Reset status if we want to restart file extension. In __ocfs2_extend_allocation, we will restart our file extension if ((!status) && restart_func). But there is a bug that the status is still left as -EGAIN. This is really an old bug, but it is masked by the return value of ocfs2_journal_dirty. So it show up when we make ocfs2_journal_dirty void. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2b4235c5831..20e0ee58dd3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -684,6 +684,7 @@ restarted_transaction: if (why == RESTART_META) { mlog(0, "restarting function.\n"); restart_func = 1; + status = 0; } else { BUG_ON(why != RESTART_TRANS); -- cgit v1.2.3-70-g09d2 From 2b0b39517d1af5294128dbc2fd7ed39c8effa540 Mon Sep 17 00:00:00 2001 From: Bill Pemberton Date: Fri, 16 Apr 2010 08:01:20 -0500 Subject: jfs: fix diAllocExt error in resizing filesystem Resizing the filesystem would result in an diAllocExt error in some instances because changes in bmp->db_agsize would not get noticed if goto extendBmap was called. Signed-off-by: Bill Pemberton Signed-off-by: Dave Kleikamp Cc: jfs-discussion@lists.sourceforge.net Cc: linux-kernel@vger.kernel.org --- fs/jfs/resize.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jfs/resize.c b/fs/jfs/resize.c index 7f24a0bb08c..1aba0039f1c 100644 --- a/fs/jfs/resize.c +++ b/fs/jfs/resize.c @@ -81,6 +81,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) struct inode *iplist[1]; struct jfs_superblock *j_sb, *j_sb2; uint old_agsize; + int agsizechanged = 0; struct buffer_head *bh, *bh2; /* If the volume hasn't grown, get out now */ @@ -333,6 +334,9 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) */ if ((rc = dbExtendFS(ipbmap, XAddress, nblocks))) goto error_out; + + agsizechanged |= (bmp->db_agsize != old_agsize); + /* * the map now has extended to cover additional nblocks: * dn_mapsize = oldMapsize + nblocks; @@ -432,7 +436,7 @@ int jfs_extendfs(struct super_block *sb, s64 newLVSize, int newLogSize) * will correctly identify the new ag); */ /* if new AG size the same as old AG size, done! */ - if (bmp->db_agsize != old_agsize) { + if (agsizechanged) { if ((rc = diExtendFS(ipimap, ipbmap))) goto error_out; -- cgit v1.2.3-70-g09d2 From c7f2e1f0ac142a657a1de00d404e1c8345b20598 Mon Sep 17 00:00:00 2001 From: Dmitry Monakhov Date: Fri, 16 Apr 2010 08:05:50 -0500 Subject: jfs: add jfs specific ->setattr call generic setattr not longer responsible for quota transfer. use jfs_setattr for all jfs's inodes. Signed-off-by: Dmitry Monakhov Signed-off-by: Dave Kleikamp --- fs/jfs/inode.c | 2 +- fs/jfs/jfs_inode.h | 1 + fs/jfs/namei.c | 4 ++-- fs/jfs/symlink.c | 14 +++++++++++++- 4 files changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index b2ae190a77b..97cd11954d0 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -60,7 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &jfs_aops; } else { - inode->i_op = &jfs_symlink_inode_operations; + inode->i_op = &jfs_fast_symlink_inode_operations; /* * The inline data should be null-terminated, but * don't let on-disk corruption crash the kernel diff --git a/fs/jfs/jfs_inode.h b/fs/jfs/jfs_inode.h index 1eff7db34d6..f8b56b238bb 100644 --- a/fs/jfs/jfs_inode.h +++ b/fs/jfs/jfs_inode.h @@ -47,5 +47,6 @@ extern const struct file_operations jfs_dir_operations; extern const struct inode_operations jfs_file_inode_operations; extern const struct file_operations jfs_file_operations; extern const struct inode_operations jfs_symlink_inode_operations; +extern const struct inode_operations jfs_fast_symlink_inode_operations; extern const struct dentry_operations jfs_ci_dentry_operations; #endif /* _H_JFS_INODE */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index c79a4270f08..114e6007110 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -946,7 +946,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, */ if (ssize <= IDATASIZE) { - ip->i_op = &jfs_symlink_inode_operations; + ip->i_op = &jfs_fast_symlink_inode_operations; i_fastsymlink = JFS_IP(ip)->i_inline; memcpy(i_fastsymlink, name, ssize); @@ -968,7 +968,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, else { jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); - ip->i_op = &page_symlink_inode_operations; + ip->i_op = &jfs_symlink_inode_operations; ip->i_mapping->a_ops = &jfs_aops; /* diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 4af1a05aad0..205b946d8e0 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -29,9 +29,21 @@ static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) return NULL; } -const struct inode_operations jfs_symlink_inode_operations = { +const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = jfs_follow_link, + .setattr = jfs_setattr, + .setxattr = jfs_setxattr, + .getxattr = jfs_getxattr, + .listxattr = jfs_listxattr, + .removexattr = jfs_removexattr, +}; + +const struct inode_operations jfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, .listxattr = jfs_listxattr, -- cgit v1.2.3-70-g09d2 From b6f8dd49dbdbfa60a33bba3d4b766fe341109b4b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 13 Apr 2010 15:06:44 +1000 Subject: xfs: ensure that sync updates the log tail correctly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Updates to the VFS layer removed an extra ->sync_fs call into the filesystem during the sync process (from the quota code). Unfortunately the sync code was unknowingly relying on this call to make sure metadata buffers were flushed via a xfs_buftarg_flush() call to move the tail of the log forward in memory before the final transactions of the sync process were issued. As a result, the old code would write a very recent log tail value to the log by the end of the sync process, and so a subsequent crash would leave nothing for log recovery to do. Hence in qa test 182, log recovery only replayed a small handle for inode fsync transactions in this case. However, with the removal of the extra ->sync_fs call, the log tail was now not moved forward with the inode fsync transactions near the end of the sync procese the first (and only) buftarg flush occurred after these transactions went to disk. The result is that log recovery now sees a large number of transactions for metadata that is already on disk. This usually isn't a problem, but when the transactions include inode chunk allocation, the inode create transactions and all subsequent changes are replayed as we cannt rely on what is on disk is valid. As a result, if the inode was written and contains unlogged changes, the unlogged changes are lost, thereby violating sync semantics. The fix is to always issue a transaction after the buftarg flush occurs is the log iѕ not idle or covered. This results in a dummy transaction being written that contains the up-to-date log tail value, which will be very recent. Indeed, it will be at least as recent as the old code would have left on disk, so log recovery will behave exactly as it used to in this situation. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Alex Elder --- fs/xfs/xfs_log.c | 38 ++++++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index e8fba92d7cd..2be01913628 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -745,9 +745,16 @@ xfs_log_move_tail(xfs_mount_t *mp, /* * Determine if we have a transaction that has gone to disk - * that needs to be covered. Log activity needs to be idle (no AIL and - * nothing in the iclogs). And, we need to be in the right state indicating - * something has gone out. + * that needs to be covered. To begin the transition to the idle state + * firstly the log needs to be idle (no AIL and nothing in the iclogs). + * If we are then in a state where covering is needed, the caller is informed + * that dummy transactions are required to move the log into the idle state. + * + * Because this is called as part of the sync process, we should also indicate + * that dummy transactions should be issued in anything but the covered or + * idle states. This ensures that the log tail is accurately reflected in + * the log at the end of the sync, hence if a crash occurrs avoids replay + * of transactions where the metadata is already on disk. */ int xfs_log_need_covered(xfs_mount_t *mp) @@ -759,17 +766,24 @@ xfs_log_need_covered(xfs_mount_t *mp) return 0; spin_lock(&log->l_icloglock); - if (((log->l_covered_state == XLOG_STATE_COVER_NEED) || - (log->l_covered_state == XLOG_STATE_COVER_NEED2)) - && !xfs_trans_ail_tail(log->l_ailp) - && xlog_iclogs_empty(log)) { - if (log->l_covered_state == XLOG_STATE_COVER_NEED) - log->l_covered_state = XLOG_STATE_COVER_DONE; - else { - ASSERT(log->l_covered_state == XLOG_STATE_COVER_NEED2); - log->l_covered_state = XLOG_STATE_COVER_DONE2; + switch (log->l_covered_state) { + case XLOG_STATE_COVER_DONE: + case XLOG_STATE_COVER_DONE2: + case XLOG_STATE_COVER_IDLE: + break; + case XLOG_STATE_COVER_NEED: + case XLOG_STATE_COVER_NEED2: + if (!xfs_trans_ail_tail(log->l_ailp) && + xlog_iclogs_empty(log)) { + if (log->l_covered_state == XLOG_STATE_COVER_NEED) + log->l_covered_state = XLOG_STATE_COVER_DONE; + else + log->l_covered_state = XLOG_STATE_COVER_DONE2; } + /* FALLTHRU */ + default: needed = 1; + break; } spin_unlock(&log->l_icloglock); return needed; -- cgit v1.2.3-70-g09d2 From f1d486a3617a2f620b31224e4ace1496c4627e39 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 13 Apr 2010 15:06:45 +1000 Subject: xfs: don't warn on EAGAIN in inode reclaim Any inode reclaim flush that returns EAGAIN will result in the inode reclaim being attempted again later. There is no need to issue a warning into the logs about this situation. Signed-off-by: Dave Chinner Reviewed-by: Alex Elder Signed-off-by: Alex Elder --- fs/xfs/linux-2.6/xfs_sync.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 05cd85317f6..fd969821575 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -820,10 +820,10 @@ xfs_reclaim_inode( * call into reclaim to find it in a clean state instead of waiting for * it now. We also don't return errors here - if the error is transient * then the next reclaim pass will flush the inode, and if the error - * is permanent then the next sync reclaim will relcaim the inode and + * is permanent then the next sync reclaim will reclaim the inode and * pass on the error. */ - if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { + if (error && error != EAGAIN && !XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_fs_cmn_err(CE_WARN, ip->i_mount, "inode 0x%llx background reclaim flush failed with %d", (long long)ip->i_ino, error); -- cgit v1.2.3-70-g09d2 From b8639077abf034824046ed09e779b74c4393031f Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Sat, 17 Apr 2010 19:54:27 +0200 Subject: [LogFS] Set s_bdi Since 32a88aa1 sync() was turned into a NOP for logfs. Worse, sync() would not return an error, giving the illusion that writeout had actually happened. Afaics jffs2 was broken as well. Signed-off-by: Joern Engel --- fs/logfs/super.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/logfs/super.c b/fs/logfs/super.c index d4531eb46d0..dacce3a917a 100644 --- a/fs/logfs/super.c +++ b/fs/logfs/super.c @@ -11,6 +11,7 @@ */ #include "logfs.h" #include +#include #include #include #include @@ -136,6 +137,10 @@ static int logfs_sb_set(struct super_block *sb, void *_super) sb->s_fs_info = super; sb->s_mtd = super->s_mtd; sb->s_bdev = super->s_bdev; + if (sb->s_bdev) + sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; + if (sb->s_mtd) + sb->s_bdi = sb->s_mtd->backing_dev_info; return 0; } -- cgit v1.2.3-70-g09d2 From 3a60a1686f0d51c99bd0df8ac93050fb6dfce647 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Mon, 22 Mar 2010 00:41:35 -0500 Subject: eCryptfs: Decrypt symlink target for stat size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Create a getattr handler for eCryptfs symlinks that is capable of reading the lower target and decrypting its path. Prior to this patch, a stat's st_size field would represent the strlen of the encrypted path, while readlink() would return the strlen of the decrypted path. This could lead to confusion in some userspace applications, since the two values should be equal. https://bugs.launchpad.net/bugs/524919 Reported-by: Loïc Minier Cc: stable@kernel.org Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 100 +++++++++++++++++++++++++++------------------------- 1 file changed, 52 insertions(+), 48 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index ddbd096c740..605f514f3b4 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -648,38 +648,17 @@ out_lock: return rc; } -static int -ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +static int ecryptfs_readlink_lower(struct dentry *dentry, char **buf, + size_t *bufsiz) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); char *lower_buf; - size_t lower_bufsiz; - struct dentry *lower_dentry; - struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - char *plaintext_name; - size_t plaintext_name_size; + size_t lower_bufsiz = PATH_MAX; mm_segment_t old_fs; int rc; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op->readlink) { - rc = -EINVAL; - goto out; - } - mount_crypt_stat = &ecryptfs_superblock_to_private( - dentry->d_sb)->mount_crypt_stat; - /* - * If the lower filename is encrypted, it will result in a significantly - * longer name. If needed, truncate the name after decode and decrypt. - */ - if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) - lower_bufsiz = PATH_MAX; - else - lower_bufsiz = bufsiz; - /* Released in this function */ lower_buf = kmalloc(lower_bufsiz, GFP_KERNEL); - if (lower_buf == NULL) { - printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc [%zd] bytes\n", __func__, lower_bufsiz); + if (!lower_buf) { rc = -ENOMEM; goto out; } @@ -689,29 +668,31 @@ ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) (char __user *)lower_buf, lower_bufsiz); set_fs(old_fs); - if (rc >= 0) { - rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, - &plaintext_name_size, - dentry, lower_buf, - rc); - if (rc) { - printk(KERN_ERR "%s: Error attempting to decode and " - "decrypt filename; rc = [%d]\n", __func__, - rc); - goto out_free_lower_buf; - } - /* Check for bufsiz <= 0 done in sys_readlinkat() */ - rc = copy_to_user(buf, plaintext_name, - min((size_t) bufsiz, plaintext_name_size)); - if (rc) - rc = -EFAULT; - else - rc = plaintext_name_size; - kfree(plaintext_name); - fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); - } -out_free_lower_buf: + if (rc < 0) + goto out; + lower_bufsiz = rc; + rc = ecryptfs_decode_and_decrypt_filename(buf, bufsiz, dentry, + lower_buf, lower_bufsiz); +out: kfree(lower_buf); + return rc; +} + +static int +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + char *kbuf; + size_t kbufsiz, copied; + int rc; + + rc = ecryptfs_readlink_lower(dentry, &kbuf, &kbufsiz); + if (rc) + goto out; + copied = min_t(size_t, bufsiz, kbufsiz); + rc = copy_to_user(buf, kbuf, copied) ? -EFAULT : copied; + kfree(kbuf); + fsstack_copy_attr_atime(dentry->d_inode, + ecryptfs_dentry_to_lower(dentry)->d_inode); out: return rc; } @@ -1016,6 +997,28 @@ out: return rc; } +int ecryptfs_getattr_link(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + int rc = 0; + + mount_crypt_stat = &ecryptfs_superblock_to_private( + dentry->d_sb)->mount_crypt_stat; + generic_fillattr(dentry->d_inode, stat); + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + char *target; + size_t targetsiz; + + rc = ecryptfs_readlink_lower(dentry, &target, &targetsiz); + if (!rc) { + kfree(target); + stat->size = targetsiz; + } + } + return rc; +} + int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1133,6 +1136,7 @@ const struct inode_operations ecryptfs_symlink_iops = { .put_link = ecryptfs_put_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr_link, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, -- cgit v1.2.3-70-g09d2 From cfce08c6bdfb20ade979284e55001ca1f100ed51 Mon Sep 17 00:00:00 2001 From: Christian Pulvermacher Date: Tue, 23 Mar 2010 11:51:38 -0500 Subject: ecryptfs: fix error code for missing xattrs in lower fs If the lower file system driver has extended attributes disabled, ecryptfs' own access functions return -ENOSYS instead of -EOPNOTSUPP. This breaks execution of programs in the ecryptfs mount, since the kernel expects the latter error when checking for security capabilities in xattrs. Signed-off-by: Christian Pulvermacher Cc: stable@kernel.org Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 605f514f3b4..efd4f47d530 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1043,7 +1043,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->setxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1061,7 +1061,7 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, int rc = 0; if (!lower_dentry->d_inode->i_op->getxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1088,7 +1088,7 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->listxattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); @@ -1105,7 +1105,7 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) lower_dentry = ecryptfs_dentry_to_lower(dentry); if (!lower_dentry->d_inode->i_op->removexattr) { - rc = -ENOSYS; + rc = -EOPNOTSUPP; goto out; } mutex_lock(&lower_dentry->d_inode->i_mutex); -- cgit v1.2.3-70-g09d2 From 133b8f9d632cc23715c6d72d1c5ac449e054a12a Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 19 Mar 2010 15:35:46 -0400 Subject: ecryptfs: fix use with tmpfs by removing d_drop from ecryptfs_destroy_inode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since tmpfs has no persistent storage, it pins all its dentries in memory so they have d_count=1 when other file systems would have d_count=0. ->lookup is only used to create new dentries. If the caller doesn't instantiate it, it's freed immediately at dput(). ->readdir reads directly from the dcache and depends on the dentries being hashed. When an ecryptfs mount is mounted, it associates the lower file and dentry with the ecryptfs files as they're accessed. When it's umounted and destroys all the in-memory ecryptfs inodes, it fput's the lower_files and d_drop's the lower_dentries. Commit 4981e081 added this and a d_delete in 2008 and several months later commit caeeeecf removed the d_delete. I believe the d_drop() needs to be removed as well. The d_drop effectively hides any file that has been accessed via ecryptfs from the underlying tmpfs since it depends on it being hashed for it to be accessible. I've removed the d_drop on my development node and see no ill effects with basic testing on both tmpfs and persistent storage. As a side effect, after ecryptfs d_drops the dentries on tmpfs, tmpfs BUGs on umount. This is due to the dentries being unhashed. tmpfs->kill_sb is kill_litter_super which calls d_genocide to drop the reference pinning the dentry. It skips unhashed and negative dentries, but shrink_dcache_for_umount_subtree doesn't. Since those dentries still have an elevated d_count, we get a BUG(). This patch removes the d_drop call and fixes both issues. This issue was reported at: https://bugzilla.novell.com/show_bug.cgi?id=567887 Reported-by: Árpád Bíró Signed-off-by: Jeff Mahoney Cc: Dustin Kirkland Cc: stable@kernel.org Signed-off-by: Tyler Hicks --- fs/ecryptfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index b15a43a80ab..1a037f77aa5 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -85,7 +85,6 @@ static void ecryptfs_destroy_inode(struct inode *inode) if (lower_dentry->d_inode) { fput(inode_info->lower_file); inode_info->lower_file = NULL; - d_drop(lower_dentry); } } ecryptfs_destroy_crypt_stat(&inode_info->crypt_stat); -- cgit v1.2.3-70-g09d2 From 3a8380c0754a7972668a46f645930910e304095c Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Tue, 23 Mar 2010 18:09:02 -0500 Subject: eCryptfs: Copy lower directory inode times and size on link The timestamps and size of a lower inode involved in a link() call was being copied to the upper parent inode. Instead, we should be copying lower parent inode's timestamps and size to the upper parent inode. I discovered this bug using the POSIX test suite at Tuxera. Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index efd4f47d530..15b424837c1 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -456,8 +456,8 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb, 0); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, lower_new_dentry->d_inode); - fsstack_copy_inode_size(dir, lower_new_dentry->d_inode); + fsstack_copy_attr_times(dir, lower_dir_dentry->d_inode); + fsstack_copy_inode_size(dir, lower_dir_dentry->d_inode); old_dentry->d_inode->i_nlink = ecryptfs_inode_to_lower(old_dentry->d_inode)->i_nlink; i_size_write(new_dentry->d_inode, file_size_save); -- cgit v1.2.3-70-g09d2 From 9f37622f897a90ad3c3da5c14d94d8f3ffc62b70 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Thu, 25 Mar 2010 11:16:56 -0500 Subject: eCryptfs: Turn lower lookup error messages into debug messages Vaugue warnings about ENAMETOOLONG errors when looking up an encrypted file name have caused many users to become concerned about their data. Since this is a rather harmless condition, I'm moving this warning to only be printed when the ecryptfs_verbosity module param is 1. Signed-off-by: Tyler Hicks --- fs/ecryptfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 15b424837c1..b980fce984c 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -388,9 +388,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " - "lower_dentry = [%s]\n", __func__, rc, - ecryptfs_dentry->d_name.name); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); goto out_d_drop; } if (lower_dentry->d_inode) @@ -417,9 +417,9 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mutex_unlock(&lower_dir_dentry->d_inode->i_mutex); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); - printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " - "lower_dentry = [%s]\n", __func__, rc, - encrypted_and_encoded_name); + ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + "[%d] on lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); goto out_d_drop; } lookup_and_interpose: -- cgit v1.2.3-70-g09d2 From 62af9b520513d78484f22f874916dfacbc889ce0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 19 Apr 2010 16:47:20 +0200 Subject: quota: Convert __DQUOT_PARANOIA symbol to standard config option Make __DQUOT_PARANOIA define from the old days a standard config option and turn it off by default. This gets rid of a quota warning about writes before quota is turned on for systems with ext4 root filesystem. Currently there's no way to legally solve this because /etc/mtab has to be written before quota is turned on on most systems. Signed-off-by: Jan Kara --- fs/quota/Kconfig | 8 ++++++++ fs/quota/dquot.c | 16 +++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/quota/Kconfig b/fs/quota/Kconfig index dad7fb247dd..3e21b1e2ad3 100644 --- a/fs/quota/Kconfig +++ b/fs/quota/Kconfig @@ -33,6 +33,14 @@ config PRINT_QUOTA_WARNING Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +config QUOTA_DEBUG + bool "Additional quota sanity checks" + depends on QUOTA + default n + help + If you say Y here, quota subsystem will perform some additional + sanity checks of quota internal structures. If unsure, say N. + # Generic support for tree structured quota files. Selected when needed. config QUOTA_TREE tristate diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index a0a9405b202..788b5802a7c 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -80,8 +80,6 @@ #include -#define __DQUOT_PARANOIA - /* * There are three quota SMP locks. dq_list_lock protects all lists with quotas * and quota formats, dqstats structure containing statistics about the lists @@ -695,7 +693,7 @@ void dqput(struct dquot *dquot) if (!dquot) return; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { printk("VFS: dqput: trying to free free dquot\n"); printk("VFS: device %s, dquot of %s %d\n", @@ -748,7 +746,7 @@ we_slept: goto we_slept; } atomic_dec(&dquot->dq_count); -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG /* sanity check */ BUG_ON(!list_empty(&dquot->dq_free)); #endif @@ -845,7 +843,7 @@ we_slept: dquot = NULL; goto out; } -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ #endif out: @@ -874,7 +872,7 @@ static int dqinit_needed(struct inode *inode, int type) static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif @@ -882,7 +880,7 @@ static void add_dquot_ref(struct super_block *sb, int type) list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW)) continue; -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif @@ -907,7 +905,7 @@ static void add_dquot_ref(struct super_block *sb, int type) spin_unlock(&inode_lock); iput(old_inode); -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened before quota" " was turned on thus quota information is probably " @@ -940,7 +938,7 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, inode->i_dquot[type] = NULL; if (dquot) { if (dqput_blocks(dquot)) { -#ifdef __DQUOT_PARANOIA +#ifdef CONFIG_QUOTA_DEBUG if (atomic_read(&dquot->dq_count) != 1) printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); #endif -- cgit v1.2.3-70-g09d2 From b6349ac89eacb813f6963f7263da05bc3f483351 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Tue, 20 Apr 2010 21:44:10 +0200 Subject: [LogFS] Split large truncated into smaller chunks Truncate would do an almost limitless amount of work without invoking the garbage collector in between. Split it up into more manageable, though still large, chunks. Signed-off-by: Joern Engel --- fs/logfs/readwrite.c | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index aca6c56a107..7e3a1e5fd76 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -1837,19 +1837,37 @@ static int __logfs_truncate(struct inode *inode, u64 size) return logfs_truncate_direct(inode, size); } -int logfs_truncate(struct inode *inode, u64 size) +/* + * Truncate, by changing the segment file, can consume a fair amount + * of resources. So back off from time to time and do some GC. + * 8 or 2048 blocks should be well within safety limits even if + * every single block resided in a different segment. + */ +#define TRUNCATE_STEP (8 * 1024 * 1024) +int logfs_truncate(struct inode *inode, u64 target) { struct super_block *sb = inode->i_sb; - int err; + u64 size = i_size_read(inode); + int err = 0; - logfs_get_wblocks(sb, NULL, 1); - err = __logfs_truncate(inode, size); - if (!err) - err = __logfs_write_inode(inode, 0); - logfs_put_wblocks(sb, NULL, 1); + size = ALIGN(size, TRUNCATE_STEP); + while (size > target) { + if (size > TRUNCATE_STEP) + size -= TRUNCATE_STEP; + else + size = 0; + if (size < target) + size = target; + + logfs_get_wblocks(sb, NULL, 1); + err = __logfs_truncate(inode, target); + if (!err) + err = __logfs_write_inode(inode, 0); + logfs_put_wblocks(sb, NULL, 1); + } if (!err) - err = vmtruncate(inode, size); + err = vmtruncate(inode, target); /* I don't trust error recovery yet. */ WARN_ON(err); -- cgit v1.2.3-70-g09d2 From b90f687018e6d6c77d981b09203780f7001407e5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 20 Apr 2010 16:51:59 -0400 Subject: ext4: Issue the discard operation *before* releasing the blocks to be reused Otherwise, we can end up having data corruption because the blocks could get reused and then discarded! https://bugzilla.kernel.org/show_bug.cgi?id=15579 Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 54df209d2ee..e5ab41b559c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2534,6 +2534,17 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); + if (test_opt(sb, DISCARD)) { + ext4_fsblk_t discard_block; + + discard_block = entry->start_blk + + ext4_group_first_block_no(sb, entry->group); + trace_ext4_discard_blocks(sb, + (unsigned long long)discard_block, + entry->count); + sb_issue_discard(sb, discard_block, entry->count); + } + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); @@ -2555,16 +2566,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) page_cache_release(e4b.bd_bitmap_page); } ext4_unlock_group(sb, entry->group); - if (test_opt(sb, DISCARD)) { - ext4_fsblk_t discard_block; - - discard_block = entry->start_blk + - ext4_group_first_block_no(sb, entry->group); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - sb_issue_discard(sb, discard_block, entry->count); - } kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); } -- cgit v1.2.3-70-g09d2 From d7dfee3f5db5575b1d838744559c3c9bb351f74f Mon Sep 17 00:00:00 2001 From: Jun Sun Date: Thu, 31 Dec 2009 17:28:52 -0800 Subject: uclinux: error message when FLAT reloc symbol is invalid, v2 This patch fixes a cosmetic error in printk. Text segment and data/bss segment are allocated from two different areas. It is not meaningful to give the diff between them in the error reporting messages. Signed-off-by: Jun Sun Signed-off-by: Greg Ungerer --- fs/binfmt_flat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index e0e769bdca5..49566c1687d 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -355,7 +355,7 @@ calc_reloc(unsigned long r, struct lib_info *p, int curid, int internalp) if (!flat_reloc_valid(r, start_brk - start_data + text_len)) { printk("BINFMT_FLAT: reloc outside program 0x%x (0 - 0x%x/0x%x)", - (int) r,(int)(start_brk-start_code),(int)text_len); + (int) r,(int)(start_brk-start_data+text_len),(int)text_len); goto failed; } -- cgit v1.2.3-70-g09d2 From 083fd8b21a13742b37ab347089c73f895a896672 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Apr 2010 12:01:23 +0100 Subject: AFS: Don't pass error value to page_cache_release() in error handling In the error handling in afs_mntpt_do_automount(), we pass an error pointer to page_cache_release() if read_mapping_page() failed. Instead, we should extend the gotos around the error handling we don't need. Reported-by: Dan Carpenter Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/afs/mntpt.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 5e813a816ce..b3feddc4f7d 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -138,9 +138,9 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) { struct afs_super_info *super; struct vfsmount *mnt; - struct page *page = NULL; + struct page *page; size_t size; - char *buf, *devname = NULL, *options = NULL; + char *buf, *devname, *options; int ret; _enter("{%s}", mntpt->d_name.name); @@ -150,22 +150,22 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) ret = -EINVAL; size = mntpt->d_inode->i_size; if (size > PAGE_SIZE - 1) - goto error; + goto error_no_devname; ret = -ENOMEM; devname = (char *) get_zeroed_page(GFP_KERNEL); if (!devname) - goto error; + goto error_no_devname; options = (char *) get_zeroed_page(GFP_KERNEL); if (!options) - goto error; + goto error_no_options; /* read the contents of the AFS special symlink */ page = read_mapping_page(mntpt->d_inode->i_mapping, 0, NULL); if (IS_ERR(page)) { ret = PTR_ERR(page); - goto error; + goto error_no_page; } ret = -EIO; @@ -196,12 +196,12 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt) return mnt; error: - if (page) - page_cache_release(page); - if (devname) - free_page((unsigned long) devname); - if (options) - free_page((unsigned long) options); + page_cache_release(page); +error_no_page: + free_page((unsigned long) options); +error_no_options: + free_page((unsigned long) devname); +error_no_devname: _leave(" = %d", ret); return ERR_PTR(ret); } -- cgit v1.2.3-70-g09d2 From 0ed07ddb56d1348e5ce33f3b8de20d730351983a Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 11:42:00 +0200 Subject: 9p: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/9p/v9fs.c | 10 ++++++++++ fs/9p/v9fs.h | 2 ++ fs/9p/vfs_super.c | 1 + 3 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 5c5bc848007..f8b86e92cd6 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -238,6 +238,13 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return ERR_PTR(-ENOMEM); } + rc = bdi_setup_and_register(&v9ses->bdi, "9p", BDI_CAP_MAP_COPY); + if (rc) { + __putname(v9ses->aname); + __putname(v9ses->uname); + return ERR_PTR(rc); + } + spin_lock(&v9fs_sessionlist_lock); list_add(&v9ses->slist, &v9fs_sessionlist); spin_unlock(&v9fs_sessionlist_lock); @@ -301,6 +308,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, return fid; error: + bdi_destroy(&v9ses->bdi); return ERR_PTR(retval); } @@ -326,6 +334,8 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); + bdi_destroy(&v9ses->bdi); + spin_lock(&v9fs_sessionlist_lock); list_del(&v9ses->slist); spin_unlock(&v9fs_sessionlist_lock); diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a0a8d3dd136..bec4d0bcb45 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -20,6 +20,7 @@ * Boston, MA 02111-1301 USA * */ +#include /** * enum p9_session_flags - option flags for each 9P session @@ -102,6 +103,7 @@ struct v9fs_session_info { u32 uid; /* if ACCESS_SINGLE, the uid that has access */ struct p9_client *clnt; /* 9p client */ struct list_head slist; /* list of sessions registered with v9fs */ + struct backing_dev_info bdi; }; struct p9_fid *v9fs_session_init(struct v9fs_session_info *, const char *, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 491108bd6e0..806da5d3b3a 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -77,6 +77,7 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_blocksize = 1 << sb->s_blocksize_bits; sb->s_magic = V9FS_MAGIC; sb->s_op = &v9fs_super_ops; + sb->s_bdi = &v9ses->bdi; sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; -- cgit v1.2.3-70-g09d2 From e1da0222753a2322d76c97fc02396fb83143c7ac Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 11:58:18 +0200 Subject: afs: add bdi backing to mount session. This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/afs/internal.h | 2 ++ fs/afs/super.c | 1 + fs/afs/volume.c | 7 +++++++ 3 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/afs/internal.h b/fs/afs/internal.h index c54dad4e606..a10f2582844 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -19,6 +19,7 @@ #include #include #include +#include #include "afs.h" #include "afs_vl.h" @@ -313,6 +314,7 @@ struct afs_volume { unsigned short rjservers; /* number of servers discarded due to -ENOMEDIUM */ struct afs_server *servers[8]; /* servers on which volume resides (ordered) */ struct rw_semaphore server_sem; /* lock for accessing current server */ + struct backing_dev_info bdi; }; /* diff --git a/fs/afs/super.c b/fs/afs/super.c index 14f6431598a..e932e5a3a0c 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -311,6 +311,7 @@ static int afs_fill_super(struct super_block *sb, void *data) sb->s_magic = AFS_FS_MAGIC; sb->s_op = &afs_super_ops; sb->s_fs_info = as; + sb->s_bdi = &as->volume->bdi; /* allocate the root inode and dentry */ fid.vid = as->volume->vid; diff --git a/fs/afs/volume.c b/fs/afs/volume.c index a353e69e239..401eeb21869 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -106,6 +106,10 @@ struct afs_volume *afs_volume_lookup(struct afs_mount_params *params) volume->cell = params->cell; volume->vid = vlocation->vldb.vid[params->type]; + ret = bdi_setup_and_register(&volume->bdi, "afs", BDI_CAP_MAP_COPY); + if (ret) + goto error_bdi; + init_rwsem(&volume->server_sem); /* look up all the applicable server records */ @@ -151,6 +155,8 @@ error: return ERR_PTR(ret); error_discard: + bdi_destroy(&volume->bdi); +error_bdi: up_write(¶ms->cell->vl_sem); for (loop = volume->nservers - 1; loop >= 0; loop--) @@ -200,6 +206,7 @@ void afs_put_volume(struct afs_volume *volume) for (loop = volume->nservers - 1; loop >= 0; loop--) afs_put_server(volume->servers[loop]); + bdi_destroy(&volume->bdi); kfree(volume); _leave(" [destroyed]"); -- cgit v1.2.3-70-g09d2 From 8044f7f468469c80031611206d554f86fcdfe704 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:09:48 +0200 Subject: cifs: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/cifs/cifs_fs_sb.h | 3 +++ fs/cifs/cifsfs.c | 10 ++++++++++ 2 files changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 4797787c6a4..246a167cb91 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -18,6 +18,8 @@ #ifndef _CIFS_FS_SB_H #define _CIFS_FS_SB_H +#include + #define CIFS_MOUNT_NO_PERM 1 /* do not do client vfs_perm check */ #define CIFS_MOUNT_SET_UID 2 /* set current's euid in create etc. */ #define CIFS_MOUNT_SERVER_INUM 4 /* inode numbers from uniqueid from server */ @@ -50,5 +52,6 @@ struct cifs_sb_info { #ifdef CONFIG_CIFS_DFS_UPCALL char *mountdata; /* mount options received at mount time */ #endif + struct backing_dev_info bdi; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index ded66be6597..ad235d604a0 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -103,6 +103,12 @@ cifs_read_super(struct super_block *sb, void *data, if (cifs_sb == NULL) return -ENOMEM; + rc = bdi_setup_and_register(&cifs_sb->bdi, "cifs", BDI_CAP_MAP_COPY); + if (rc) { + kfree(cifs_sb); + return rc; + } + #ifdef CONFIG_CIFS_DFS_UPCALL /* copy mount params to sb for use in submounts */ /* BB: should we move this after the mount so we @@ -115,6 +121,7 @@ cifs_read_super(struct super_block *sb, void *data, int len = strlen(data); cifs_sb->mountdata = kzalloc(len + 1, GFP_KERNEL); if (cifs_sb->mountdata == NULL) { + bdi_destroy(&cifs_sb->bdi); kfree(sb->s_fs_info); sb->s_fs_info = NULL; return -ENOMEM; @@ -135,6 +142,7 @@ cifs_read_super(struct super_block *sb, void *data, sb->s_magic = CIFS_MAGIC_NUMBER; sb->s_op = &cifs_super_ops; + sb->s_bdi = &cifs_sb->bdi; /* if (cifs_sb->tcon->ses->server->maxBuf > MAX_CIFS_HDR_SIZE + 512) sb->s_blocksize = cifs_sb->tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE; */ @@ -183,6 +191,7 @@ out_mount_failed: } #endif unload_nls(cifs_sb->local_nls); + bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); } return rc; @@ -214,6 +223,7 @@ cifs_put_super(struct super_block *sb) #endif unload_nls(cifs_sb->local_nls); + bdi_destroy(&cifs_sb->bdi); kfree(cifs_sb); unlock_kernel(); -- cgit v1.2.3-70-g09d2 From 5163d90076729413cb882d3dd5c3d3cfb5b9f035 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:12:40 +0200 Subject: coda: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/coda/inode.c | 8 ++++++++ include/linux/coda_psdev.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/coda/inode.c b/fs/coda/inode.c index a1695dcadd9..d97f9935a02 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -167,6 +167,10 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return -EBUSY; } + error = bdi_setup_and_register(&vc->bdi, "coda", BDI_CAP_MAP_COPY); + if (error) + goto bdi_err; + vc->vc_sb = sb; sb->s_fs_info = vc; @@ -175,6 +179,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) sb->s_blocksize_bits = 12; sb->s_magic = CODA_SUPER_MAGIC; sb->s_op = &coda_super_operations; + sb->s_bdi = &vc->bdi; /* get root fid from Venus: this needs the root inode */ error = venus_rootfid(sb, &fid); @@ -200,6 +205,8 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) return 0; error: + bdi_destroy(&vc->bdi); + bdi_err: if (root) iput(root); if (vc) @@ -210,6 +217,7 @@ static int coda_fill_super(struct super_block *sb, void *data, int silent) static void coda_put_super(struct super_block *sb) { + bdi_destroy(&coda_vcp(sb)->bdi); coda_vcp(sb)->vc_sb = NULL; sb->s_fs_info = NULL; diff --git a/include/linux/coda_psdev.h b/include/linux/coda_psdev.h index 5b5d4731f95..644062e8d85 100644 --- a/include/linux/coda_psdev.h +++ b/include/linux/coda_psdev.h @@ -1,6 +1,7 @@ #ifndef __CODA_PSDEV_H #define __CODA_PSDEV_H +#include #include #define CODA_PSDEV_MAJOR 67 @@ -17,6 +18,7 @@ struct venus_comm { struct list_head vc_processing; int vc_inuse; struct super_block *vc_sb; + struct backing_dev_info bdi; }; -- cgit v1.2.3-70-g09d2 From 9df9c8b930156a2f9ce2b2ae66acb14bee2663f5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:22:04 +0200 Subject: ecryptfs: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/ecryptfs/ecryptfs_kernel.h | 2 ++ fs/ecryptfs/main.c | 10 +++++++++- fs/ecryptfs/super.c | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index bc7115403f3..bfc2e0f78f0 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -35,6 +35,7 @@ #include #include #include +#include /* Version verification for shared data structures w/ userspace */ #define ECRYPTFS_VERSION_MAJOR 0x00 @@ -393,6 +394,7 @@ struct ecryptfs_mount_crypt_stat { struct ecryptfs_sb_info { struct super_block *wsi_sb; struct ecryptfs_mount_crypt_stat mount_crypt_stat; + struct backing_dev_info bdi; }; /* file private data. */ diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index af1a8f01eba..760983d0f25 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -497,17 +497,25 @@ struct kmem_cache *ecryptfs_sb_info_cache; static int ecryptfs_fill_super(struct super_block *sb, void *raw_data, int silent) { + struct ecryptfs_sb_info *esi; int rc = 0; /* Released in ecryptfs_put_super() */ ecryptfs_set_superblock_private(sb, kmem_cache_zalloc(ecryptfs_sb_info_cache, GFP_KERNEL)); - if (!ecryptfs_superblock_to_private(sb)) { + esi = ecryptfs_superblock_to_private(sb); + if (!esi) { ecryptfs_printk(KERN_WARNING, "Out of memory\n"); rc = -ENOMEM; goto out; } + + rc = bdi_setup_and_register(&esi->bdi, "ecryptfs", BDI_CAP_MAP_COPY); + if (rc) + goto out; + + sb->s_bdi = &esi->bdi; sb->s_op = &ecryptfs_sops; /* Released through deactivate_super(sb) from get_sb_nodev */ sb->s_root = d_alloc(NULL, &(const struct qstr) { diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 278743c7716..0c0ae491d23 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -122,6 +122,7 @@ static void ecryptfs_put_super(struct super_block *sb) lock_kernel(); ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); + bdi_destroy(&sb_info->bdi); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); ecryptfs_set_superblock_private(sb, NULL); -- cgit v1.2.3-70-g09d2 From b3d0ab7e60d1865bb6f6a79a77aaba22f2543236 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:26:04 +0200 Subject: exofs: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/exofs/exofs.h | 2 ++ fs/exofs/super.c | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 8442e353309..54373278a35 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -35,6 +35,7 @@ #include #include +#include #include "common.h" /* FIXME: Remove once pnfs hits mainline @@ -92,6 +93,7 @@ struct exofs_sb_info { struct exofs_layout layout; /* Default files layout, * contains the variable osd_dev * array. Keep last */ + struct backing_dev_info bdi; struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; diff --git a/fs/exofs/super.c b/fs/exofs/super.c index 18e57ea1e5b..03149b9a517 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -302,6 +302,7 @@ static void exofs_put_super(struct super_block *sb) _exofs_print_device("Unmounting", NULL, sbi->layout.s_ods[0], sbi->layout.s_pid); + bdi_destroy(&sbi->bdi); exofs_free_sbi(sbi); sb->s_fs_info = NULL; } @@ -546,6 +547,10 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + ret = bdi_setup_and_register(&sbi->bdi, "exofs", BDI_CAP_MAP_COPY); + if (ret) + goto free_bdi; + /* use mount options to fill superblock */ od = osduld_path_lookup(opts->dev_name); if (IS_ERR(od)) { @@ -612,6 +617,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) } /* set up operation vectors */ + sb->s_bdi = &sbi->bdi; sb->s_fs_info = sbi; sb->s_op = &exofs_sops; sb->s_export_op = &exofs_export_ops; @@ -643,6 +649,8 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent) return 0; free_sbi: + bdi_destroy(&sbi->bdi); +free_bdi: EXOFS_ERR("Unable to mount exofs on %s pid=0x%llx err=%d\n", opts->dev_name, sbi->layout.s_pid, ret); exofs_free_sbi(sbi); -- cgit v1.2.3-70-g09d2 From f1970c73cbb6b884152207e4dfe90639f5029905 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:31:11 +0200 Subject: ncpfs: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/ncpfs/inode.c | 8 ++++++++ include/linux/ncp_fs_sb.h | 2 ++ 2 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index cf98da1be23..fa338515402 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -526,10 +526,15 @@ static int ncp_fill_super(struct super_block *sb, void *raw_data, int silent) sb->s_blocksize_bits = 10; sb->s_magic = NCP_SUPER_MAGIC; sb->s_op = &ncp_sops; + sb->s_bdi = &server->bdi; server = NCP_SBP(sb); memset(server, 0, sizeof(*server)); + error = bdi_setup_and_register(&server->bdi, "ncpfs", BDI_CAP_MAP_COPY); + if (error) + goto out_bdi; + server->ncp_filp = ncp_filp; server->ncp_sock = sock; @@ -719,6 +724,8 @@ out_fput2: if (server->info_filp) fput(server->info_filp); out_fput: + bdi_destroy(&server->bdi); +out_bdi: /* 23/12/1998 Marcin Dalecki : * * The previously used put_filp(ncp_filp); was bogous, since @@ -756,6 +763,7 @@ static void ncp_put_super(struct super_block *sb) kill_pid(server->m.wdog_pid, SIGTERM, 1); put_pid(server->m.wdog_pid); + bdi_destroy(&server->bdi); kfree(server->priv.data); kfree(server->auth.object_name); vfree(server->rxbuf); diff --git a/include/linux/ncp_fs_sb.h b/include/linux/ncp_fs_sb.h index 6330fc76b00..5ec9ca67168 100644 --- a/include/linux/ncp_fs_sb.h +++ b/include/linux/ncp_fs_sb.h @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef __KERNEL__ @@ -127,6 +128,7 @@ struct ncp_server { size_t len; __u8 data[128]; } unexpected_packet; + struct backing_dev_info bdi; }; extern void ncp_tcp_rcv_proc(struct work_struct *work); -- cgit v1.2.3-70-g09d2 From 424264b7b220e8eee165dc3080ae48692af73dec Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 22 Apr 2010 12:37:07 +0200 Subject: smbfs: add bdi backing to mount session This ensures that dirty data gets flushed properly. Signed-off-by: Jens Axboe --- fs/smbfs/inode.c | 8 ++++++++ include/linux/smb_fs_sb.h | 3 +++ 2 files changed, 11 insertions(+) (limited to 'fs') diff --git a/fs/smbfs/inode.c b/fs/smbfs/inode.c index 1c4c8f08997..dfa1d67f8fc 100644 --- a/fs/smbfs/inode.c +++ b/fs/smbfs/inode.c @@ -479,6 +479,7 @@ smb_put_super(struct super_block *sb) if (server->conn_pid) kill_pid(server->conn_pid, SIGTERM, 1); + bdi_destroy(&server->bdi); kfree(server->ops); smb_unload_nls(server); sb->s_fs_info = NULL; @@ -525,6 +526,11 @@ static int smb_fill_super(struct super_block *sb, void *raw_data, int silent) if (!server) goto out_no_server; sb->s_fs_info = server; + + if (bdi_setup_and_register(&server->bdi, "smbfs", BDI_CAP_MAP_COPY)) + goto out_bdi; + + sb->s_bdi = &server->bdi; server->super_block = sb; server->mnt = NULL; @@ -624,6 +630,8 @@ out_no_smbiod: out_bad_option: kfree(mem); out_no_mem: + bdi_destroy(&server->bdi); +out_bdi: if (!server->mnt) printk(KERN_ERR "smb_fill_super: allocation failure\n"); sb->s_fs_info = NULL; diff --git a/include/linux/smb_fs_sb.h b/include/linux/smb_fs_sb.h index 8a060a7040d..bb947dd1fba 100644 --- a/include/linux/smb_fs_sb.h +++ b/include/linux/smb_fs_sb.h @@ -10,6 +10,7 @@ #define _SMB_FS_SB #include +#include #include /* @@ -74,6 +75,8 @@ struct smb_sb_info { struct smb_ops *ops; struct super_block *super_block; + + struct backing_dev_info bdi; }; static inline int -- cgit v1.2.3-70-g09d2 From 1f063d2cdf332a8a5722006b1345d15d16007c6e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Apr 2010 15:35:55 -0400 Subject: NFSv4: Don't attempt an atomic open if the file is a mountpoint Fix https://bugzilla.kernel.org/show_bug.cgi?id=15789 Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index be46f26c9a5..fbb4cf79a20 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1050,7 +1050,7 @@ static int nfs_open_revalidate(struct dentry *dentry, struct nameidata *nd) struct inode *dir; int openflags, ret = 0; - if (!is_atomic_open(nd)) + if (!is_atomic_open(nd) || d_mountpoint(dentry)) goto no_open; parent = dget_parent(dentry); dir = parent->d_inode; -- cgit v1.2.3-70-g09d2 From 356e76b855bdbfd8d1c5e75bcf0c6bf0dfe83496 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Thu, 22 Apr 2010 15:35:56 -0400 Subject: NFS: rsize and wsize settings ignored on v4 mounts NFSv4 mounts ignore the rsize and wsize mount options, and always use the default transfer size for both. This seems to be because all NFSv4 mounts are now cloned, and the cloning logic doesn't copy the rsize and wsize settings from the parent nfs_server. I tested Fedora's 2.6.32.11-99 and it seems to have this problem as well, so I'm guessing that .33, .32, and perhaps older kernels have this issue as well. Signed-off-by: Chuck Lever Cc: Stable Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a8766c4ef2e..acc9c4943b8 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -966,6 +966,8 @@ out_error: static void nfs_server_copy_userdata(struct nfs_server *target, struct nfs_server *source) { target->flags = source->flags; + target->rsize = source->rsize; + target->wsize = source->wsize; target->acregmin = source->acregmin; target->acregmax = source->acregmax; target->acdirmin = source->acdirmin; -- cgit v1.2.3-70-g09d2 From cdd29ecfcb9554132cd94b82ae8b69ba37adb3b5 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Apr 2010 15:35:56 -0400 Subject: nfs: testing for null instead of ERR_PTR() nfs_path() returns an ERR_PTR(), it doesn't return null. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e01637240ee..f9327bbaf46 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2657,7 +2657,7 @@ static void nfs_fix_devname(const struct path *path, struct vfsmount *mnt) devname = nfs_path(path->mnt->mnt_devname, path->mnt->mnt_root, path->dentry, page, PAGE_SIZE); - if (devname == NULL) + if (IS_ERR(devname)) goto out_freepage; tmp = kstrdup(devname, GFP_KERNEL); if (tmp == NULL) -- cgit v1.2.3-70-g09d2 From 71d0a6112a363e703e383ae5b12c492485c39701 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 22 Apr 2010 15:35:57 -0400 Subject: NFS: Fix an unstable write data integrity race Commit 2c61be0a9478258f77b66208a0c4b1f5f8161c3c (NFS: Ensure that the WRITE and COMMIT RPC calls are always uninterruptible) exposed a race on file close. In order to ensure correct close-to-open behaviour, we want to wait for all outstanding background commit operations to complete. This patch adds an inode flag that indicates if a commit operation is under way, and provides a mechanism to allow ->write_inode() to wait for its completion if this is a data integrity flush. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/nfs_fs.h | 1 + 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index de38d63aa92..ccde2aeb3fe 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1201,6 +1201,25 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) +static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +{ + if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) + return 1; + if (may_wait && !out_of_line_wait_on_bit_lock(&nfsi->flags, + NFS_INO_COMMIT, nfs_wait_bit_killable, + TASK_KILLABLE)) + return 1; + return 0; +} + +static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +{ + clear_bit(NFS_INO_COMMIT, &nfsi->flags); + smp_mb__after_clear_bit(); + wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); +} + + static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1262,8 +1281,6 @@ static int nfs_commit_rpcsetup(struct list_head *head, task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) return PTR_ERR(task); - if (how & FLUSH_SYNC) - rpc_wait_for_completion_task(task); rpc_put_task(task); return 0; } @@ -1294,6 +1311,7 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) BDI_RECLAIMABLE); nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(inode)); return -ENOMEM; } @@ -1349,6 +1367,7 @@ static void nfs_commit_release(void *calldata) next: nfs_clear_page_tag_locked(req); } + nfs_commit_clear_lock(NFS_I(data->inode)); nfs_commitdata_release(calldata); } @@ -1363,8 +1382,11 @@ static const struct rpc_call_ops nfs_commit_ops = { static int nfs_commit_inode(struct inode *inode, int how) { LIST_HEAD(head); - int res; + int may_wait = how & FLUSH_SYNC; + int res = 0; + if (!nfs_commit_set_lock(NFS_I(inode), may_wait)) + goto out; spin_lock(&inode->i_lock); res = nfs_scan_commit(inode, &head, 0, 0); spin_unlock(&inode->i_lock); @@ -1372,7 +1394,13 @@ static int nfs_commit_inode(struct inode *inode, int how) int error = nfs_commit_list(inode, &head, how); if (error < 0) return error; - } + if (may_wait) + wait_on_bit(&NFS_I(inode)->flags, NFS_INO_COMMIT, + nfs_wait_bit_killable, + TASK_KILLABLE); + } else + nfs_commit_clear_lock(NFS_I(inode)); +out: return res; } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 1a0b85aa151..07ce4609fe5 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -209,6 +209,7 @@ struct nfs_inode { #define NFS_INO_FLUSHING (4) /* inode is flushing out data */ #define NFS_INO_FSCACHE (5) /* inode can be cached by FS-Cache */ #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie management lock */ +#define NFS_INO_COMMIT (7) /* inode is committing unstable writes */ static inline struct nfs_inode *NFS_I(const struct inode *inode) { -- cgit v1.2.3-70-g09d2 From 792590c72376649b4e315df386fd208b83db05b9 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Sun, 4 Apr 2010 22:20:58 +0100 Subject: squashfs: fix locking bug in zlib wrapper Fix locking bug in zlib wrapper introduced by recent decompressor changes. Signed-off-by: Phillip Lougher --- fs/squashfs/zlib_wrapper.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/zlib_wrapper.c b/fs/squashfs/zlib_wrapper.c index 15a03d0fb9f..7a603874e48 100644 --- a/fs/squashfs/zlib_wrapper.c +++ b/fs/squashfs/zlib_wrapper.c @@ -128,8 +128,9 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void **buffer, goto release_mutex; } + length = stream->total_out; mutex_unlock(&msblk->read_data_mutex); - return stream->total_out; + return length; release_mutex: mutex_unlock(&msblk->read_data_mutex); -- cgit v1.2.3-70-g09d2 From 3a3076f4d6e2fa31338a0b007df42a3b32f079e0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 23 Apr 2010 12:17:17 -0400 Subject: Cleanup generic block based fiemap This cleans up a few of the complaints of __generic_block_fiemap. I've fixed all the typing stuff, used inline functions instead of macros, gotten rid of a couple of variables, and made sure the size and block requests are all block aligned. It also fixes a problem where sometimes FIEMAP_EXTENT_LAST wasn't being set properly. Signed-off-by: Josef Bacik Signed-off-by: Linus Torvalds --- fs/ioctl.c | 92 +++++++++++++++++++++++++++++++----------------------- include/linux/fs.h | 5 +-- 2 files changed, 56 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/ioctl.c b/fs/ioctl.c index 6c751106c2e..7faefb4da93 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -228,14 +228,23 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) #ifdef CONFIG_BLOCK -#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) -#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} /** * __generic_block_fiemap - FIEMAP for block based inodes (no locking) - * @inode - the inode to map - * @arg - the pointer to userspace where we copy everything to - * @get_block - the fs's get_block function + * @inode: the inode to map + * @fieinfo: the fiemap info struct that will be passed back to userspace + * @start: where to start mapping in the inode + * @len: how much space to map + * @get_block: the fs's get_block function * * This does FIEMAP for block based inodes. Basically it will just loop * through get_block until we hit the number of extents we want to map, or we @@ -250,58 +259,63 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) */ int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) + struct fiemap_extent_info *fieinfo, loff_t start, + loff_t len, get_block_t *get_block) { - struct buffer_head tmp; - unsigned long long start_blk; - long long length = 0, map_len = 0; + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); u64 logical = 0, phys = 0, size = 0; u32 flags = FIEMAP_EXTENT_MERGED; - int ret = 0, past_eof = 0, whole_file = 0; + bool past_eof = false, whole_file = false; + int ret = 0; - if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) return ret; - start_blk = logical_to_blk(inode, start); - - length = (long long)min_t(u64, len, i_size_read(inode)); - if (length < len) - whole_file = 1; + /* + * Either the i_mutex or other appropriate locking needs to be held + * since we expect isize to not change at all through the duration of + * this call. + */ + if (len >= isize) { + whole_file = true; + len = isize; + } - map_len = length; + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); do { /* * we set b_size to the total size we want so it will map as * many contiguous blocks as possible at once */ - memset(&tmp, 0, sizeof(struct buffer_head)); - tmp.b_size = map_len; + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; - ret = get_block(inode, start_blk, &tmp, 0); + ret = get_block(inode, start_blk, &map_bh, 0); if (ret) break; /* HOLE */ - if (!buffer_mapped(&tmp)) { - length -= blk_to_logical(inode, 1); + if (!buffer_mapped(&map_bh)) { start_blk++; /* - * we want to handle the case where there is an + * We want to handle the case where there is an * allocated block at the front of the file, and then * nothing but holes up to the end of the file properly, * to make sure that extent at the front gets properly * marked with FIEMAP_EXTENT_LAST */ if (!past_eof && - blk_to_logical(inode, start_blk) >= - blk_to_logical(inode, 0)+i_size_read(inode)) + blk_to_logical(inode, start_blk) >= isize) past_eof = 1; /* - * first hole after going past the EOF, this is our + * First hole after going past the EOF, this is our * last extent */ if (past_eof && size) { @@ -309,15 +323,18 @@ int __generic_block_fiemap(struct inode *inode, ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); - break; + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; } /* if we have holes up to/past EOF then we're done */ - if (length <= 0 || past_eof) + if (start_blk > last_blk || past_eof || ret) break; } else { /* - * we have gone over the length of what we wanted to + * We have gone over the length of what we wanted to * map, and it wasn't the entire file, so add the extent * we got last time and exit. * @@ -331,7 +348,7 @@ int __generic_block_fiemap(struct inode *inode, * are good to go, just add the extent to the fieinfo * and break */ - if (length <= 0 && !whole_file) { + if (start_blk > last_blk && !whole_file) { ret = fiemap_fill_next_extent(fieinfo, logical, phys, size, flags); @@ -351,11 +368,10 @@ int __generic_block_fiemap(struct inode *inode, } logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, tmp.b_blocknr); - size = tmp.b_size; + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; flags = FIEMAP_EXTENT_MERGED; - length -= tmp.b_size; start_blk += logical_to_blk(inode, size); /* @@ -363,15 +379,13 @@ int __generic_block_fiemap(struct inode *inode, * soon as we find a hole that the last extent we found * is marked with FIEMAP_EXTENT_LAST */ - if (!past_eof && - logical+size >= - blk_to_logical(inode, 0)+i_size_read(inode)) - past_eof = 1; + if (!past_eof && logical + size >= isize) + past_eof = true; } cond_resched(); } while (1); - /* if ret is 1 then we just hit the end of the extent array */ + /* If ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; diff --git a/include/linux/fs.h b/include/linux/fs.h index 39d57bc6cc7..44f35aea2f1 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2315,8 +2315,9 @@ extern int vfs_fstatat(int , char __user *, struct kstat *, int); extern int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, unsigned long arg); extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); + struct fiemap_extent_info *fieinfo, + loff_t start, loff_t len, + get_block_t *get_block); extern int generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len, get_block_t *get_block); -- cgit v1.2.3-70-g09d2 From d4cd1871cff68e188dadcf6d1280762522b643eb Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Thu, 22 Apr 2010 16:11:19 +0800 Subject: ocfs2: add OCFS2_INODE_SKIP_ORPHAN_DIR flag and honor it in the inode wipe code Currently in the error path of ocfs2_symlink and ocfs2_mknod, we just call iput with the inode we failed with, but the inode wipe code will complain because we don't add the inode to orphan dir. One solution would be to lock the orphan dir during the entire transaction, but that's too heavy for a rare error path. Instead, we add a flag, OCFS2_INODE_SKIP_ORPHAN_DIR which tells the inode wipe code that it won't find this inode in the orphan dir. [ Merge fixes and comment style cleanups -Mark ] Signed-off-by: Li Dongyang Signed-off-by: Mark Fasheh --- fs/ocfs2/inode.c | 65 +++++++++++++++++++++++++++++++------------------------- fs/ocfs2/inode.h | 2 ++ fs/ocfs2/namei.c | 1 + 3 files changed, 39 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 07cc8bb68b6..26399202be7 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -639,11 +639,13 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_unlock; } - status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, - orphan_dir_bh); - if (status < 0) { - mlog_errno(status); - goto bail_commit; + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + status = ocfs2_orphan_del(osb, handle, orphan_dir_inode, inode, + orphan_dir_bh); + if (status < 0) { + mlog_errno(status); + goto bail_commit; + } } /* set the inodes dtime */ @@ -726,34 +728,35 @@ static int ocfs2_wipe_inode(struct inode *inode, struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_dinode *di; + struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; - di = (struct ocfs2_dinode *) di_bh->b_data; - orphaned_slot = le16_to_cpu(di->i_orphaned_slot); + if (!(OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { + orphaned_slot = le16_to_cpu(di->i_orphaned_slot); - status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); - if (status) - return status; + status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot); + if (status) + return status; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, - ORPHAN_DIR_SYSTEM_INODE, - orphaned_slot); - if (!orphan_dir_inode) { - status = -EEXIST; - mlog_errno(status); - goto bail; - } + orphan_dir_inode = ocfs2_get_system_file_inode(osb, + ORPHAN_DIR_SYSTEM_INODE, + orphaned_slot); + if (!orphan_dir_inode) { + status = -EEXIST; + mlog_errno(status); + goto bail; + } - /* Lock the orphan dir. The lock will be held for the entire - * delete_inode operation. We do this now to avoid races with - * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); - status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); - if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + /* Lock the orphan dir. The lock will be held for the entire + * delete_inode operation. We do this now to avoid races with + * recovery completion on other nodes. */ + mutex_lock(&orphan_dir_inode->i_mutex); + status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); + if (status < 0) { + mutex_unlock(&orphan_dir_inode->i_mutex); - mlog_errno(status); - goto bail; + mlog_errno(status); + goto bail; + } } /* we do this while holding the orphan dir lock because we @@ -794,6 +797,9 @@ static int ocfs2_wipe_inode(struct inode *inode, mlog_errno(status); bail_unlock_dir: + if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR) + return status; + ocfs2_inode_unlock(orphan_dir_inode, 1); mutex_unlock(&orphan_dir_inode->i_mutex); brelse(orphan_dir_bh); @@ -889,7 +895,8 @@ static int ocfs2_query_inode_wipe(struct inode *inode, /* Do some basic inode verification... */ di = (struct ocfs2_dinode *) di_bh->b_data; - if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL)) && + !(oi->ip_flags & OCFS2_INODE_SKIP_ORPHAN_DIR)) { /* * Inodes in the orphan dir must have ORPHANED_FL. The only * inodes that come back out of the orphan dir are reflink diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index ba4fe07b293..0b28e1921a3 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -100,6 +100,8 @@ struct ocfs2_inode_info #define OCFS2_INODE_MAYBE_ORPHANED 0x00000020 /* Does someone have the file open O_DIRECT */ #define OCFS2_INODE_OPEN_DIRECT 0x00000040 +/* Tell the inode wipe code it's not in orphan dir */ +#define OCFS2_INODE_SKIP_ORPHAN_DIR 0x00000080 static inline struct ocfs2_inode_info *OCFS2_I(struct inode *inode) { diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b1eb50ae409..ae315c9c768 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1976,6 +1976,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, } le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); + OCFS2_I(inode)->ip_flags &= ~OCFS2_INODE_SKIP_ORPHAN_DIR; /* Record which orphan dir our inode now resides * in. delete_inode will use this to determine which orphan -- cgit v1.2.3-70-g09d2 From ab41fdc8fdd9f0942430941c1e2b516fd481371d Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Thu, 22 Apr 2010 16:11:25 +0800 Subject: ocfs2: use OCFS2_INODE_SKIP_ORPHAN_DIR in ocfs2_symlink error path Mark the inode with flag OCFS2_INODE_SKIP_ORPHAN_DIR when we get an error after allocating one, so that we can kill the inode. Signed-off-by: Li Dongyang Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ae315c9c768..b66e4885582 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1811,6 +1811,7 @@ bail: if (xattr_ac) ocfs2_free_alloc_context(xattr_ac); if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; clear_nlink(inode); iput(inode); } -- cgit v1.2.3-70-g09d2 From 062d340384dcf77dfd8de0a082b5da571de3925a Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Thu, 22 Apr 2010 16:11:29 +0800 Subject: ocfs2: use OCFS2_INODE_SKIP_ORPHAN_DIR in ocfs2_mknod error path Mark the inode with flag OCFS2_INODE_SKIP_ORPHAN_DIR in ocfs2_mknod, so we can kill the inode in case of error. [ Fixed up comment style -Mark ] Signed-off-by: Li Dongyang Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index b66e4885582..8ff035eabfd 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -445,11 +445,6 @@ leave: ocfs2_free_dir_lookup_result(&lookup); - if ((status < 0) && inode) { - clear_nlink(inode); - iput(inode); - } - if (inode_ac) ocfs2_free_alloc_context(inode_ac); @@ -459,6 +454,17 @@ leave: if (meta_ac) ocfs2_free_alloc_context(meta_ac); + /* + * We should call iput after the i_mutex of the bitmap been + * unlocked in ocfs2_free_alloc_context, or the + * ocfs2_delete_inode will mutex_lock again. + */ + if ((status < 0) && inode) { + OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SKIP_ORPHAN_DIR; + clear_nlink(inode); + iput(inode); + } + mlog_exit(status); return status; -- cgit v1.2.3-70-g09d2 From a9743fcdc0eb43d028b71267438076e1b0112ba0 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Fri, 23 Apr 2010 11:42:22 -0700 Subject: ocfs2: Add directory entry later in ocfs2_symlink() and ocfs2_mknod() If we get a failure during creation of an inode we'll allow the orphan code to remove the inode, which is correct. However, we need to ensure that we don't get any errors after the call to ocfs2_add_entry(), otherwise we could leave a dangling directory reference. The solution is simple - in both cases, all I had to do was move ocfs2_dentry_attach_lock() above the ocfs2_add_entry() call. Signed-off-by: Mark Fasheh --- fs/ocfs2/namei.c | 40 +++++++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8ff035eabfd..4cbb18f26c5 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -408,23 +408,28 @@ static int ocfs2_mknod(struct inode *dir, } } - status = ocfs2_add_entry(handle, dentry, inode, - OCFS2_I(inode)->ip_blkno, parent_fe_bh, - &lookup); - if (status < 0) { + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, + OCFS2_I(dir)->ip_blkno); + if (status) { mlog_errno(status); goto leave; } + dentry->d_op = &ocfs2_dentry_ops; - status = ocfs2_dentry_attach_lock(dentry, inode, - OCFS2_I(dir)->ip_blkno); - if (status) { + status = ocfs2_add_entry(handle, dentry, inode, + OCFS2_I(inode)->ip_blkno, parent_fe_bh, + &lookup); + if (status < 0) { mlog_errno(status); goto leave; } insert_inode_hash(inode); - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; leave: @@ -1777,22 +1782,27 @@ static int ocfs2_symlink(struct inode *dir, } } - status = ocfs2_add_entry(handle, dentry, inode, - le64_to_cpu(fe->i_blkno), parent_fe_bh, - &lookup); - if (status < 0) { + /* + * Do this before adding the entry to the directory. We add + * also set d_op after success so that ->d_iput() will cleanup + * the dentry lock even if ocfs2_add_entry() fails below. + */ + status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); + if (status) { mlog_errno(status); goto bail; } + dentry->d_op = &ocfs2_dentry_ops; - status = ocfs2_dentry_attach_lock(dentry, inode, OCFS2_I(dir)->ip_blkno); - if (status) { + status = ocfs2_add_entry(handle, dentry, inode, + le64_to_cpu(fe->i_blkno), parent_fe_bh, + &lookup); + if (status < 0) { mlog_errno(status); goto bail; } insert_inode_hash(inode); - dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); bail: if (status < 0 && did_quota) -- cgit v1.2.3-70-g09d2 From 0350cb078f5035716ebdad4ad4709d02fe466a8a Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Apr 2010 11:39:29 +0200 Subject: ocfs2: potential ERR_PTR dereference on error paths If "handle" is non null at the end of the function then we assume it's a valid pointer and pass it to ocfs2_commit_trans(); Signed-off-by: Dan Carpenter Cc: Signed-off-by: Joel Becker --- fs/ocfs2/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index ab207901d32..23c254e2601 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -559,6 +559,7 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); + handle = NULL; mlog_errno(status); goto out; } -- cgit v1.2.3-70-g09d2 From c21a534e2f24968cf74976a4e721ac194db30ded Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 21 Apr 2010 14:05:55 +0800 Subject: ocfs2: Update VFS inode's id info after reflink. In reflink we update the id info on the disk but forgot to update the corresponding information in the VFS inode. Update them accordingly when we want to preserve the attributes. Reported-by: Jeff Liu Signed-off-by: Tao Ma Cc: Signed-off-by: Joel Becker --- fs/ocfs2/refcounttree.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 29405f2ff61..32a8ac589c8 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4084,6 +4084,9 @@ static int ocfs2_complete_reflink(struct inode *s_inode, di->i_attr = s_di->i_attr; if (preserve) { + t_inode->i_uid = s_inode->i_uid; + t_inode->i_gid = s_inode->i_gid; + t_inode->i_mode = s_inode->i_mode; di->i_uid = s_di->i_uid; di->i_gid = s_di->i_gid; di->i_mode = s_di->i_mode; -- cgit v1.2.3-70-g09d2 From a36d515c7a2dfacebcf41729f6812dbc424ebcf0 Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Fri, 23 Apr 2010 15:24:59 -0700 Subject: ocfs2_dlmfs: Fix math error when reading LVB. When asked for a partial read of the LVB in a dlmfs file, we can accidentally calculate a negative count. Reported-by: Dan Carpenter Cc: Signed-off-by: Joel Becker --- fs/ocfs2/dlmfs/dlmfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index a99d1eafa8e..b83d6107a1f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -262,7 +262,7 @@ static ssize_t dlmfs_file_read(struct file *filp, if ((count + *ppos) > i_size_read(inode)) readlen = i_size_read(inode) - *ppos; else - readlen = count - *ppos; + readlen = count; lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) -- cgit v1.2.3-70-g09d2 From cac36f707119b792b2396aed371d6b5cdc194890 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 23 Apr 2010 13:17:37 -0400 Subject: reiserfs: fix permissions on .reiserfs_priv Commit 677c9b2e393a0cd203bd54e9c18b012b2c73305a ("reiserfs: remove privroot hiding in lookup") removed the magic from the lookup code to hide the .reiserfs_priv directory since it was getting loaded at mount-time instead. The intent was that the entry would be hidden from the user via a poisoned d_compare, but this was faulty. This introduced a security issue where unprivileged users could access and modify extended attributes or ACLs belonging to other users, including root. This patch resolves the issue by properly hiding .reiserfs_priv. This was the intent of the xattr poisoning code, but it appears to have never worked as expected. This is fixed by using d_revalidate instead of d_compare. This patch makes -oexpose_privroot a no-op. I'm fine leaving it this way. The effort involved in working out the corner cases wrt permissions and caching outweigh the benefit of the feature. Signed-off-by: Jeff Mahoney Acked-by: Edward Shishkin Reported-by: Matt McCutchen Tested-by: Matt McCutchen Cc: Frederic Weisbecker Cc: Al Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/dir.c | 2 -- fs/reiserfs/xattr.c | 17 ++++------------- 2 files changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index f8a6075abf5..07930449a95 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -46,8 +46,6 @@ static inline bool is_privroot_deh(struct dentry *dir, struct reiserfs_de_head *deh) { struct dentry *privroot = REISERFS_SB(dir->d_sb)->priv_root; - if (reiserfs_expose_privroot(dir->d_sb)) - return 0; return (dir == dir->d_parent && privroot->d_inode && deh->deh_objectid == INODE_PKEY(privroot->d_inode)->k_objectid); } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 4f9586bb763..28f0a448f3c 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -973,21 +973,13 @@ int reiserfs_permission(struct inode *inode, int mask) return generic_permission(inode, mask, NULL); } -/* This will catch lookups from the fs root to .reiserfs_priv */ -static int -xattr_lookup_poison(struct dentry *dentry, struct qstr *q1, struct qstr *name) +static int xattr_hide_revalidate(struct dentry *dentry, struct nameidata *nd) { - struct dentry *priv_root = REISERFS_SB(dentry->d_sb)->priv_root; - if (container_of(q1, struct dentry, d_name) == priv_root) - return -ENOENT; - if (q1->len == name->len && - !memcmp(q1->name, name->name, name->len)) - return 0; - return 1; + return -EPERM; } static const struct dentry_operations xattr_lookup_poison_ops = { - .d_compare = xattr_lookup_poison, + .d_revalidate = xattr_hide_revalidate, }; int reiserfs_lookup_privroot(struct super_block *s) @@ -1001,8 +993,7 @@ int reiserfs_lookup_privroot(struct super_block *s) strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { REISERFS_SB(s)->priv_root = dentry; - if (!reiserfs_expose_privroot(s)) - s->s_root->d_op = &xattr_lookup_poison_ops; + dentry->d_op = &xattr_lookup_poison_ops; if (dentry->d_inode) dentry->d_inode->i_flags |= S_PRIVATE; } else -- cgit v1.2.3-70-g09d2 From fb2162df74bb19552db3d988fd11c787cf5fad56 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Fri, 23 Apr 2010 13:17:41 -0400 Subject: reiserfs: fix corruption during shrinking of xattrs Commit 48b32a3553a54740d236b79a90f20147a25875e3 ("reiserfs: use generic xattr handlers") introduced a problem that causes corruption when extended attributes are replaced with a smaller value. The issue is that the reiserfs_setattr to shrink the xattr file was moved from before the write to after the write. The root issue has always been in the reiserfs xattr code, but was papered over by the fact that in the shrink case, the file would just be expanded again while the xattr was written. The end result is that the last 8 bytes of xattr data are lost. This patch fixes it to use new_size. Addresses https://bugzilla.kernel.org/show_bug.cgi?id=14826 Signed-off-by: Jeff Mahoney Reported-by: Christian Kujau Tested-by: Christian Kujau Cc: Edward Shishkin Cc: Jethro Beekman Cc: Greg Surbey Cc: Marco Gatti Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/reiserfs/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 28f0a448f3c..e7cc00e636d 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -554,7 +554,7 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (!err && new_size < i_size_read(dentry->d_inode)) { struct iattr newattrs = { .ia_ctime = current_fs_time(inode->i_sb), - .ia_size = buffer_size, + .ia_size = new_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; -- cgit v1.2.3-70-g09d2 From b8af67e2681c693a21f3933e3bdfce4cf66596d3 Mon Sep 17 00:00:00 2001 From: Anton Blanchard Date: Fri, 23 Apr 2010 13:18:06 -0400 Subject: fs/block_dev.c: fix performance regression in O_DIRECT|O_SYNC writes to block devices We are seeing a large regression in database performance on recent kernels. The database opens a block device with O_DIRECT|O_SYNC and a number of threads write to different regions of the file at the same time. A simple test case is below. I haven't defined DEVICE since getting it wrong will destroy your data :) On an 3 disk LVM with a 64k chunk size we see about 17MB/sec and only a few threads in IO wait: procs -----io---- -system-- -----cpu------ r b bi bo in cs us sy id wa st 0 3 0 16170 656 2259 0 0 86 14 0 0 2 0 16704 695 2408 0 0 92 8 0 0 2 0 17308 744 2653 0 0 86 14 0 0 2 0 17933 759 2777 0 0 89 10 0 Most threads are blocking in vfs_fsync_range, which has: mutex_lock(&mapping->host->i_mutex); err = fop->fsync(file, dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); commit 148f948ba877f4d3cdef036b1ff6d9f68986706a (vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode) offers some explanation of what is going on: Use these new helpers for syncing from generic VFS functions. This makes O_SYNC writes to block devices acquire i_mutex for syncing. If we really care about this, we can make block_fsync() drop the i_mutex and reacquire it before it returns. Thanks Jan for such a good commit message! As well as dropping i_mutex, Christoph suggests we should remove the call to sync_blockdev(): > sync_blockdev is an overcomplicated alias for filemap_write_and_wait on > the block device inode, which is exactly what we did just before calling > into ->fsync The patch below incorporates both suggestions. With it the testcase improves from 17MB/s to 68M/sec: procs -----io---- -system-- -----cpu------ r b bi bo in cs us sy id wa st 0 7 0 65536 1000 3878 0 0 70 30 0 0 34 0 69632 1016 3921 0 1 46 53 0 0 57 0 69632 1000 3921 0 0 55 45 0 0 53 0 69640 754 4111 0 0 81 19 0 Testcase: #define _GNU_SOURCE #include #include #include #include #include #include #include #include #define NR_THREADS 64 #define BUFSIZE (64 * 1024) #define DEVICE "/dev/mapper/XXXXXX" #define ALIGN(VAL, SIZE) (((VAL)+(SIZE)-1) & ~((SIZE)-1)) static int fd; static void *doit(void *arg) { unsigned long offset = (long)arg; char *b, *buf; b = malloc(BUFSIZE + 1024); buf = (char *)ALIGN((unsigned long)b, 1024); memset(buf, 0, BUFSIZE); while (1) pwrite(fd, buf, BUFSIZE, offset); } int main(int argc, char *argv[]) { int flags = O_RDWR|O_DIRECT; int i; unsigned long offset = 0; if (argc > 1 && !strcmp(argv[1], "O_SYNC")) flags |= O_SYNC; fd = open(DEVICE, flags); if (fd == -1) { perror("open"); exit(1); } for (i = 0; i < NR_THREADS-1; i++) { pthread_t tid; pthread_create(&tid, NULL, doit, (void *)offset); offset += BUFSIZE; } doit((void *)offset); return 0; } Signed-off-by: Anton Blanchard Acked-by: Jan Kara Cc: Christoph Hellwig Cc: Alexander Viro Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 2a6d0193f13..6dcee88c2e5 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -406,16 +406,23 @@ static loff_t block_llseek(struct file *file, loff_t offset, int origin) int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct inode *bd_inode = filp->f_mapping->host; + struct block_device *bdev = I_BDEV(bd_inode); int error; - error = sync_blockdev(bdev); - if (error) - return error; - + /* + * There is no need to serialise calls to blkdev_issue_flush with + * i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + mutex_unlock(&bd_inode->i_mutex); + error = blkdev_issue_flush(bdev, NULL); if (error == -EOPNOTSUPP) error = 0; + + mutex_lock(&bd_inode->i_mutex); + return error; } EXPORT_SYMBOL(blkdev_fsync); -- cgit v1.2.3-70-g09d2 From 1cb08e97389bb603e1b999312d9686c8faf0187a Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 16 Apr 2010 01:01:36 +0100 Subject: squashfs: fix warn_on when root inode is corrupted Fix warn_on triggered by mounting a fsfuzzer corrupted file system, where the root inode has been corrupted. Signed-off-by: Phillip Lougher Reported-by: Steve Grubb --- fs/squashfs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 3550aec2f65..07ceeb8d8f5 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -275,7 +275,8 @@ allocate_root: err = squashfs_read_inode(root, root_inode); if (err) { - iget_failed(root); + make_bad_inode(root); + iput(root); goto failed_mount; } insert_inode_hash(root); -- cgit v1.2.3-70-g09d2 From 370ec3d1ed9d76ba992e5b9b7d7d10700014d436 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 23 Apr 2010 00:24:22 +0100 Subject: squashfs: add missing buffer free Signed-off-by: Phillip Lougher --- fs/squashfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 07ceeb8d8f5..48b6f4a385a 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -354,6 +354,7 @@ static void squashfs_put_super(struct super_block *sb) kfree(sbi->id_table); kfree(sbi->fragment_index); kfree(sbi->meta_index); + kfree(sbi->inode_lookup_table); kfree(sb->s_fs_info); sb->s_fs_info = NULL; } -- cgit v1.2.3-70-g09d2 From e0d1f70010dce062ccce1bbd940a661e60b82631 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Fri, 23 Apr 2010 02:32:02 +0100 Subject: squashfs: fix potential buffer over-run on 4K block file systems Sizing the buffer based on block size is incorrect, leading to a potential buffer over-run on 4K block size file systems (because the metadata block size is always 8K). This bug doesn't seem have triggered because 4K block size file systems are not default, and also because metadata blocks after compression tend to be less than 4K. Signed-off-by: Phillip Lougher --- fs/squashfs/block.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 1cb0d81b164..653c030eb84 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -87,9 +87,8 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, u64 cur_index = index >> msblk->devblksize_log2; int bytes, compressed, b = 0, k = 0, page = 0, avail; - - bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, - sizeof(*bh), GFP_KERNEL); + bh = kcalloc(((srclength + msblk->devblksize - 1) + >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL); if (bh == NULL) return -ENOMEM; -- cgit v1.2.3-70-g09d2 From 5129a469a91a91427334c40e29e64c6d0ab68caf Mon Sep 17 00:00:00 2001 From: Jörn Engel Date: Sun, 25 Apr 2010 08:54:42 +0200 Subject: Catch filesystems lacking s_bdi noop_backing_dev_info is used only as a flag to mark filesystems that don't have any backing store, like tmpfs, procfs, spufs, etc. Signed-off-by: Joern Engel Changed the BUG_ON() to a WARN_ON(). Note that adding dirty inodes to the noop_backing_dev_info is not legal and will not result in them being flushed, but we already catch this condition in __mark_inode_dirty() when checking for a registered bdi. Signed-off-by: Jens Axboe --- fs/super.c | 8 +++++--- fs/sync.c | 3 ++- include/linux/backing-dev.h | 1 + mm/backing-dev.c | 5 +++++ 4 files changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index f35ac602210..dc72491a19f 100644 --- a/fs/super.c +++ b/fs/super.c @@ -693,6 +693,7 @@ int set_anon_super(struct super_block *s, void *data) return -EMFILE; } s->s_dev = MKDEV(0, dev & MINORMASK); + s->s_bdi = &noop_backing_dev_info; return 0; } @@ -954,10 +955,11 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (error < 0) goto out_free_secdata; BUG_ON(!mnt->mnt_sb); + WARN_ON(!mnt->mnt_sb->s_bdi); - error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); - if (error) - goto out_sb; + error = security_sb_kern_mount(mnt->mnt_sb, flags, secdata); + if (error) + goto out_sb; /* * filesystems should never set s_maxbytes larger than MAX_LFS_FILESIZE diff --git a/fs/sync.c b/fs/sync.c index fc5c3d75cf3..92b228176f7 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ @@ -32,7 +33,7 @@ static int __sync_filesystem(struct super_block *sb, int wait) * This should be safe, as we require bdi backing to actually * write out data in the first place */ - if (!sb->s_bdi) + if (!sb->s_bdi || sb->s_bdi == &noop_backing_dev_info) return 0; if (sb->s_qcop && sb->s_qcop->quota_sync) diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index e19c677f219..bd0e3c6f323 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -247,6 +247,7 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); #endif extern struct backing_dev_info default_backing_dev_info; +extern struct backing_dev_info noop_backing_dev_info; void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page); int writeback_in_progress(struct backing_dev_info *bdi); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dbda4707f59..707d0dc6da0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -27,6 +27,11 @@ struct backing_dev_info default_backing_dev_info = { }; EXPORT_SYMBOL_GPL(default_backing_dev_info); +struct backing_dev_info noop_backing_dev_info = { + .name = "noop", +}; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); + static struct class *bdi_class; /* -- cgit v1.2.3-70-g09d2 From e6d086d83cf7f102d48c006f58172a69ec0c15a4 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 26 Apr 2010 10:27:54 +0200 Subject: btrfs: convert to using bdi_setup_and_register() It's now a provided helper, so get rid of the internal setup and btrfs atomic_t bdi enumerator. Signed-off-by: Jens Axboe --- fs/btrfs/disk-io.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e7b8f2c89cc..feca04197d0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -44,8 +44,6 @@ static struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); -static atomic_t btrfs_bdi_num = ATOMIC_INIT(0); - /* * end_io_wq structs are used to do processing in task context when an IO is * complete. This is used during reads to verify checksums, and it is used @@ -1375,19 +1373,11 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; - bdi->name = "btrfs"; bdi->capabilities = BDI_CAP_MAP_COPY; - err = bdi_init(bdi); + err = bdi_setup_and_register(bdi, "btrfs", BDI_CAP_MAP_COPY); if (err) return err; - err = bdi_register(bdi, NULL, "btrfs-%d", - atomic_inc_return(&btrfs_bdi_num)); - if (err) { - bdi_destroy(bdi); - return err; - } - bdi->ra_pages = default_backing_dev_info.ra_pages; bdi->unplug_io_fn = btrfs_unplug_io_fn; bdi->unplug_io_data = info; -- cgit v1.2.3-70-g09d2 From dd77ef924c835c9813c3f4dc7e9c72e9cd88d238 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Tue, 20 Apr 2010 17:00:37 +1000 Subject: xfs: more swap extent fixes for dynamic fork offsets A new xfsqa test (226) with a prototype xfs_fsr change to try to handle dynamic fork offsets better triggers an assertion failure where the inode data fork is in btree format, yet there is room in the inode for it to be in extent format. The two inodes look like: before: ino 0x101 (target), num_extents 11, Max in-fork extents 6, broot size 40, fork offset 96 before: ino 0x115 (temp), num_extents 5, Max in-fork extents 3, broot size 40, fork offset 56 after: ino 0x101 (target), num_extents 5, Max in-fork extents 6, broot size 40, fork offset 96 after: ino 0x115 (temp), num_extents 11, Max in-fork extents 3, broot size 40, fork offset 56 Basically the target inode ends up with 5 extents in btree format, but it had space for 6 extents in extent format, so ends up incorrect. Notably here the broot size is the same, and that is where the kernel code is going wrong - the btree root will fit, so it lets the swap go ahead. The check should not allow the swap to take place if the number of extents while in btree format is less than the number of extents that can fit in the inode in extent format. Adding that check will prevent this swap and corruption from occurring. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/xfs_dfrag.c | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index cd27c9d6c71..5bba29a0781 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -177,16 +177,26 @@ xfs_swap_extents_check_format( XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) return EINVAL; - /* Check root block of temp in btree form to max in target */ + /* + * If we are in a btree format, check that the temp root block will fit + * in the target and that it has enough extents to be in btree format + * in the target. + * + * Note that we have to be careful to allow btree->extent conversions + * (a common defrag case) which will occur when the temp inode is in + * extent format... + */ if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(ip) && - tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + ((XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) || + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) <= ip->i_df.if_ext_max)) return EINVAL; - /* Check root block of target in btree form to max in temp */ + /* Reciprocal target->temp btree format checks */ if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && - XFS_IFORK_BOFF(tip) && - ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + ((XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) || + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) <= tip->i_df.if_ext_max)) return EINVAL; return 0; -- cgit v1.2.3-70-g09d2 From 2bc3c1179c781b359d4f2f3439cb3df72afc17fc Mon Sep 17 00:00:00 2001 From: Neil Brown Date: Tue, 20 Apr 2010 12:16:52 +1000 Subject: nfsd4: bug in read_buf When read_buf is called to move over to the next page in the pagelist of an NFSv4 request, it sets argp->end to essentially a random number, certainly not an address within the page which argp->p now points to. So subsequent calls to READ_BUF will think there is much more than a page of spare space (the cast to u32 ensures an unsigned comparison) so we can expect to fall off the end of the second page. We never encountered thsi in testing because typically the only operations which use more than two pages are write-like operations, which have their own decoding logic. Something like a getattr after a write may cross a page boundary, but it would be very unusual for it to cross another boundary after that. Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e1703175ee2..34ccf815ea8 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -161,10 +161,10 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = p + (argp->pagelen>>2); + argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; } else { - argp->end = p + (PAGE_SIZE>>2); + argp->end = argp->p + (PAGE_SIZE>>2); argp->pagelen -= PAGE_SIZE; } memcpy(((char*)p)+avail, argp->p, (nbytes - avail)); @@ -1426,10 +1426,10 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = p + (argp->pagelen>>2); + argp->end = argp->p + (argp->pagelen>>2); argp->pagelen = 0; } else { - argp->end = p + (PAGE_SIZE>>2); + argp->end = argp->p + (PAGE_SIZE>>2); argp->pagelen -= PAGE_SIZE; } } -- cgit v1.2.3-70-g09d2 From 16a5b3c4143fc7f6cbe0ef9fd4e9a58376f91506 Mon Sep 17 00:00:00 2001 From: Christoph Egger Date: Mon, 26 Apr 2010 15:56:36 +0100 Subject: Remove redundant check for CONFIG_MMU The checks for CONFIG_MMU at this location are duplicated as all the code is located inside a #ifndef CONFIG_MMU block. So the first conditional block will always be included while the second never will. Signed-off-by: Christoph Egger Signed-off-by: David Howells Signed-off-by: Linus Torvalds --- fs/binfmt_elf_fdpic.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7ab23e006e4..2c5f9a0e5d7 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1005,15 +1005,8 @@ static int elf_fdpic_map_file_constdisp_on_uclinux( } } else if (!mm->start_data) { mm->start_data = seg->addr; -#ifndef CONFIG_MMU mm->end_data = seg->addr + phdr->p_memsz; -#endif } - -#ifdef CONFIG_MMU - if (seg->addr + phdr->p_memsz > mm->end_data) - mm->end_data = seg->addr + phdr->p_memsz; -#endif } seg++; -- cgit v1.2.3-70-g09d2 From ba8b06e67ed7a560b0e7c80091bcadda4f4727a5 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 27 Apr 2010 18:33:54 -0400 Subject: NFS: Ensure that nfs_wb_page() waits for Pg_writeback to clear Neil Brown reports that he is seeing the BUG_ON(ret == 0) trigger in nfs_page_async_flush. According to the trace in https://bugzilla.novell.com/show_bug.cgi?id=599628 the problem appears to be due to nfs_wb_page() not waiting for the PG_writeback flag to clear. There is a ditto problem in nfs_wb_page_cancel() Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 19 ++++--------------- 1 file changed, 4 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ccde2aeb3fe..3aea3ca98ab 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1472,6 +1472,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) BUG_ON(!PageLocked(page)); for (;;) { + wait_on_page_writeback(page); req = nfs_page_find_request(page); if (req == NULL) break; @@ -1506,30 +1507,18 @@ int nfs_wb_page(struct inode *inode, struct page *page) .range_start = range_start, .range_end = range_end, }; - struct nfs_page *req; - int need_commit; int ret; while(PagePrivate(page)) { + wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { ret = nfs_writepage_locked(page, &wbc); if (ret < 0) goto out_error; } - req = nfs_find_and_lock_request(page); - if (!req) - break; - if (IS_ERR(req)) { - ret = PTR_ERR(req); + ret = sync_inode(inode, &wbc); + if (ret < 0) goto out_error; - } - need_commit = test_bit(PG_CLEAN, &req->wb_flags); - nfs_clear_page_tag_locked(req); - if (need_commit) { - ret = nfs_commit_inode(inode, FLUSH_SYNC); - if (ret < 0) - goto out_error; - } } return 0; out_error: -- cgit v1.2.3-70-g09d2 From 3835541dd481091c4dbf5ef83c08aed12e50fd61 Mon Sep 17 00:00:00 2001 From: Jerome Marchand Date: Tue, 27 Apr 2010 13:13:06 -0700 Subject: procfs: fix tid fdinfo Correct the file_operations struct in fdinfo entry of tid_base_stuff[]. Presently /proc/*/task/*/fdinfo contains symlinks to opened files like /proc/*/fd/. Signed-off-by: Jerome Marchand Cc: Alexander Viro Cc: Miklos Szeredi Cc: Alexey Dobriyan Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 7621db800a7..8418fcc0a6a 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2909,7 +2909,7 @@ out_no_task: */ static const struct pid_entry tid_base_stuff[] = { DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), - DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), REG("environ", S_IRUSR, proc_environ_operations), INF("auxv", S_IRUSR, proc_pid_auxv), ONE("status", S_IRUGO, proc_pid_status), -- cgit v1.2.3-70-g09d2 From acf82b85a70f39786e3cbb1ffed8655bcc972424 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 22 Apr 2010 11:28:39 +0200 Subject: nfs: fix some issues in nfs41_proc_reclaim_complete() The original code passed an ERR_PTR() to rpc_put_task() and instead of returning zero on success it returned -ENOMEM. Signed-off-by: Dan Carpenter Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 638067007c6..071fcedd517 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -5218,9 +5218,12 @@ static int nfs41_proc_reclaim_complete(struct nfs_client *clp) msg.rpc_resp = &calldata->res; task_setup_data.callback_data = calldata; task = rpc_run_task(&task_setup_data); - if (IS_ERR(task)) + if (IS_ERR(task)) { status = PTR_ERR(task); + goto out; + } rpc_put_task(task); + return 0; out: dprintk("<-- %s status=%d\n", __func__, status); return status; -- cgit v1.2.3-70-g09d2 From 9699eda6bc1f708a28acb716e1477aa351362fe2 Mon Sep 17 00:00:00 2001 From: Xiaotian Feng Date: Thu, 22 Apr 2010 18:56:17 +0800 Subject: nfs: fix memory leak in nfs_get_sb with CONFIG_NFS_V4 With CONFIG_NFS_V4 and data version 4, nfs_get_sb will allocate memory for export_path in nfs4_validate_text_mount_data, so we need to free it then. This is addressed in following kmemleak report: unreferenced object 0xffff88016bf48a50 (size 16): comm "mount.nfs", pid 22567, jiffies 4651574704 (age 175471.200s) hex dump (first 16 bytes): 2f 6f 70 74 2f 77 6f 72 6b 00 6b 6b 6b 6b 6b a5 /opt/work.kkkkk. backtrace: [] kmemleak_alloc+0x60/0xa7 [] kmemleak_alloc_recursive.clone.5+0x1b/0x1d [] __kmalloc_track_caller+0x18f/0x1b7 [] kstrndup+0x37/0x54 [] nfs_parse_devname+0x152/0x204 [nfs] [] nfs4_validate_text_mount_data+0xd0/0xdc [nfs] [] nfs_get_sb+0x325/0x736 [nfs] [] vfs_kern_mount+0xbd/0x17c [] do_kern_mount+0x4d/0xed [] do_mount+0x787/0x7fe [] sys_mount+0x88/0xc2 [] system_call_fastpath+0x16/0x1b Signed-off-by: Xiaotian Feng Cc: Trond Myklebust Cc: Chuck Lever Cc: Benny Halevy Cc: Al Viro Cc: Andy Adamson Signed-off-by: Trond Myklebust --- fs/nfs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9327bbaf46..b4148fc00f9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2187,6 +2187,7 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (data->version == 4) { error = nfs4_try_mount(flags, dev_name, data, mnt); kfree(data->client_address); + kfree(data->nfs_server.export_path); goto out; } #endif /* CONFIG_NFS_V4 */ -- cgit v1.2.3-70-g09d2 From d9e80b7de91db05c1c4d2e5ebbfd70b3b3ba0e0f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 29 Apr 2010 03:10:43 +0100 Subject: nfs d_revalidate() is too trigger-happy with d_drop() If dentry found stale happens to be a root of disconnected tree, we can't d_drop() it; its d_hash is actually part of s_anon and d_drop() would simply hide it from shrink_dcache_for_umount(), leading to all sorts of fun, including busy inodes on umount and oopsen after that. Bug had been there since at least 2006 (commit c636eb already has it), so it's definitely -stable fodder. Signed-off-by: Al Viro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/dir.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index be46f26c9a5..db3ad849a28 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -837,6 +837,8 @@ out_zap_parent: /* If we have submounts, don't unhash ! */ if (have_submounts(dentry)) goto out_valid; + if (dentry->d_flags & DCACHE_DISCONNECTED) + goto out_valid; shrink_dcache_parent(dentry); } d_drop(dentry); -- cgit v1.2.3-70-g09d2 From a36fed12a4d980eebb2e67b87ea30ad090238cff Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 29 Apr 2010 13:38:00 +0300 Subject: exofs: Fix "add bdi backing to mount session" fall out Commit b3d0ab7e60d1865bb6f6a79a77aaba22f2543236 ("exofs: add bdi backing to mount session") has a bug in the placement of the bdi member at struct exofs_sb_info. The layout member must be kept last. Signed-off-by: Boaz Harrosh Acked-by: Jens Axboe Signed-off-by: Linus Torvalds --- fs/exofs/exofs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 54373278a35..22721b2fd89 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -85,6 +85,7 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + struct backing_dev_info bdi; /* register our bdi with VFS */ struct pnfs_osd_data_map data_map; /* Default raid to use * FIXME: Needed ? @@ -93,7 +94,6 @@ struct exofs_sb_info { struct exofs_layout layout; /* Default files layout, * contains the variable osd_dev * array. Keep last */ - struct backing_dev_info bdi; struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; -- cgit v1.2.3-70-g09d2 From f80a0ca6ad8f2800453e819dafa09a0ed9e56850 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 28 Apr 2010 14:36:41 +0200 Subject: pktcdvd: improve BKL and compat_ioctl.c usage The pktcdvd driver uses proper locking and does not need the BKL in the ioctl and llseek functions of the character device, so kill both. Moving the compat_ioctl handling from common code into the driver itself fixes build problems when CONFIG_BLOCK is disabled. Acked-by: Randy Dunlap Signed-off-by: Arnd Bergmann Signed-off-by: Linus Torvalds --- drivers/block/pktcdvd.c | 17 ++++++++++++++--- fs/compat_ioctl.c | 3 --- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index ddf19425245..8a549db2aa7 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -2984,7 +2985,7 @@ static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd) mutex_unlock(&ctl_mutex); } -static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg) +static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; struct pkt_ctrl_command ctrl_cmd; @@ -3021,10 +3022,20 @@ static int pkt_ctl_ioctl(struct inode *inode, struct file *file, unsigned int cm return ret; } +#ifdef CONFIG_COMPAT +static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); +} +#endif static const struct file_operations pkt_ctl_fops = { - .ioctl = pkt_ctl_ioctl, - .owner = THIS_MODULE, + .open = nonseekable_open, + .unlocked_ioctl = pkt_ctl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = pkt_ctl_compat_ioctl, +#endif + .owner = THIS_MODULE, }; static struct miscdevice pkt_misc = { diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c32a1b6a856..641640dc7ae 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -102,7 +102,6 @@ #include #include #include -#include #include @@ -1126,8 +1125,6 @@ COMPATIBLE_IOCTL(PPGETMODE) COMPATIBLE_IOCTL(PPGETPHASE) COMPATIBLE_IOCTL(PPGETFLAGS) COMPATIBLE_IOCTL(PPSETFLAGS) -/* pktcdvd */ -COMPATIBLE_IOCTL(PACKET_CTRL_CMD) /* Big A */ /* sparc only */ /* Big Q for sound/OSS */ -- cgit v1.2.3-70-g09d2 From 5477d0face8a3ba4e9a1e7283692fff9c92f8e5e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Apr 2010 20:33:35 +0200 Subject: fs: fs/super.c needs to include backing-dev.h for !CONFIG_BLOCK When CONFIG_BLOCK is set, it ends up getting backing-dev.h included. But for !CONFIG_BLOCK, it isn't so lucky. The proper thing to do is include directly from the file it's used from, so do that. Signed-off-by: Jens Axboe --- fs/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index dc72491a19f..1527e6a0ee3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include "internal.h" -- cgit v1.2.3-70-g09d2 From 3c2023dd8ed31e2ecfbb2d5aa20e8884d4b339e2 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Thu, 29 Apr 2010 20:35:29 +0200 Subject: exofs: Fix "add bdi backing to mount session" fall out The patch: add bdi backing to mount session (b3d0ab7e60d1865bb6f6a79a77aaba22f2543236) Has a bug in the placement of the bdi member at struct exofs_sb_info. The layout member must be kept last. Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- fs/exofs/exofs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index 54373278a35..22721b2fd89 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -85,6 +85,7 @@ struct exofs_sb_info { u32 s_next_generation; /* next gen # to use */ atomic_t s_curr_pending; /* number of pending commands */ uint8_t s_cred[OSD_CAP_LEN]; /* credential for the fscb */ + struct backing_dev_info bdi; /* register our bdi with VFS */ struct pnfs_osd_data_map data_map; /* Default raid to use * FIXME: Needed ? @@ -93,7 +94,6 @@ struct exofs_sb_info { struct exofs_layout layout; /* Default files layout, * contains the variable osd_dev * array. Keep last */ - struct backing_dev_info bdi; struct osd_dev *_min_one_dev[1]; /* Place holder for one dev */ }; -- cgit v1.2.3-70-g09d2 From 9bf729c0af67897ea8498ce17c29b0683f7f2028 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Thu, 29 Apr 2010 09:55:50 +1000 Subject: xfs: add a shrinker to background inode reclaim On low memory boxes or those with highmem, kernel can OOM before the background reclaims inodes via xfssyncd. Add a shrinker to run inode reclaim so that it inode reclaim is expedited when memory is low. This is more complex than it needs to be because the VM folk don't want a context added to the shrinker infrastructure. Hence we need to add a global list of XFS mount structures so the shrinker can traverse them. Signed-off-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/xfs/linux-2.6/xfs_super.c | 5 ++ fs/xfs/linux-2.6/xfs_sync.c | 112 ++++++++++++++++++++++++++++++++++++++--- fs/xfs/linux-2.6/xfs_sync.h | 7 ++- fs/xfs/quota/xfs_qm_syscalls.c | 3 +- fs/xfs/xfs_ag.h | 1 + fs/xfs/xfs_mount.h | 1 + 6 files changed, 120 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 52e06b487ce..29f1edca76d 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1209,6 +1209,7 @@ xfs_fs_put_super( xfs_unmountfs(mp); xfs_freesb(mp); + xfs_inode_shrinker_unregister(mp); xfs_icsb_destroy_counters(mp); xfs_close_devices(mp); xfs_dmops_put(mp); @@ -1622,6 +1623,8 @@ xfs_fs_fill_super( if (error) goto fail_vnrele; + xfs_inode_shrinker_register(mp); + kfree(mtpt); return 0; @@ -1867,6 +1870,7 @@ init_xfs_fs(void) goto out_cleanup_procfs; vfs_initquota(); + xfs_inode_shrinker_init(); error = register_filesystem(&xfs_fs_type); if (error) @@ -1894,6 +1898,7 @@ exit_xfs_fs(void) { vfs_exitquota(); unregister_filesystem(&xfs_fs_type); + xfs_inode_shrinker_destroy(); xfs_sysctl_unregister(); xfs_cleanup_procfs(); xfs_buf_terminate(); diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index fd969821575..a427c638d90 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -95,7 +95,8 @@ xfs_inode_ag_walk( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { uint32_t first_index; int last_error = 0; @@ -134,7 +135,7 @@ restart: if (error == EFSCORRUPTED) break; - } while (1); + } while ((*nr_to_scan)--); if (skipped) { delay(1); @@ -150,12 +151,15 @@ xfs_inode_ag_iterator( struct xfs_perag *pag, int flags), int flags, int tag, - int exclusive) + int exclusive, + int *nr_to_scan) { int error = 0; int last_error = 0; xfs_agnumber_t ag; + int nr; + nr = nr_to_scan ? *nr_to_scan : INT_MAX; for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { struct xfs_perag *pag; @@ -165,14 +169,18 @@ xfs_inode_ag_iterator( continue; } error = xfs_inode_ag_walk(mp, pag, execute, flags, tag, - exclusive); + exclusive, &nr); xfs_perag_put(pag); if (error) { last_error = error; if (error == EFSCORRUPTED) break; } + if (nr <= 0) + break; } + if (nr_to_scan) + *nr_to_scan = nr; return XFS_ERROR(last_error); } @@ -291,7 +299,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); if (error) return XFS_ERROR(error); @@ -310,7 +318,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG, 0); + XFS_ICI_NO_TAG, 0, NULL); } STATIC int @@ -673,6 +681,7 @@ __xfs_inode_set_reclaim_tag( radix_tree_tag_set(&pag->pag_ici_root, XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable++; } /* @@ -705,6 +714,7 @@ __xfs_inode_clear_reclaim_tag( { radix_tree_tag_clear(&pag->pag_ici_root, XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + pag->pag_ici_reclaimable--; } /* @@ -854,5 +864,93 @@ xfs_reclaim_inodes( int mode) { return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, - XFS_ICI_RECLAIM_TAG, 1); + XFS_ICI_RECLAIM_TAG, 1, NULL); +} + +/* + * Shrinker infrastructure. + * + * This is all far more complex than it needs to be. It adds a global list of + * mounts because the shrinkers can only call a global context. We need to make + * the shrinkers pass a context to avoid the need for global state. + */ +static LIST_HEAD(xfs_mount_list); +static struct rw_semaphore xfs_mount_list_lock; + +static int +xfs_reclaim_inode_shrink( + int nr_to_scan, + gfp_t gfp_mask) +{ + struct xfs_mount *mp; + struct xfs_perag *pag; + xfs_agnumber_t ag; + int reclaimable = 0; + + if (nr_to_scan) { + if (!(gfp_mask & __GFP_FS)) + return -1; + + down_read(&xfs_mount_list_lock); + list_for_each_entry(mp, &xfs_mount_list, m_mplist) { + xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0, + XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan); + if (nr_to_scan <= 0) + break; + } + up_read(&xfs_mount_list_lock); + } + + down_read(&xfs_mount_list_lock); + list_for_each_entry(mp, &xfs_mount_list, m_mplist) { + for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { + + pag = xfs_perag_get(mp, ag); + if (!pag->pag_ici_init) { + xfs_perag_put(pag); + continue; + } + reclaimable += pag->pag_ici_reclaimable; + xfs_perag_put(pag); + } + } + up_read(&xfs_mount_list_lock); + return reclaimable; +} + +static struct shrinker xfs_inode_shrinker = { + .shrink = xfs_reclaim_inode_shrink, + .seeks = DEFAULT_SEEKS, +}; + +void __init +xfs_inode_shrinker_init(void) +{ + init_rwsem(&xfs_mount_list_lock); + register_shrinker(&xfs_inode_shrinker); +} + +void +xfs_inode_shrinker_destroy(void) +{ + ASSERT(list_empty(&xfs_mount_list)); + unregister_shrinker(&xfs_inode_shrinker); +} + +void +xfs_inode_shrinker_register( + struct xfs_mount *mp) +{ + down_write(&xfs_mount_list_lock); + list_add_tail(&mp->m_mplist, &xfs_mount_list); + up_write(&xfs_mount_list_lock); +} + +void +xfs_inode_shrinker_unregister( + struct xfs_mount *mp) +{ + down_write(&xfs_mount_list_lock); + list_del(&mp->m_mplist); + up_write(&xfs_mount_list_lock); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index d480c346cab..cdcbaaca988 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -53,6 +53,11 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag, int write_lock); + int flags, int tag, int write_lock, int *nr_to_scan); + +void xfs_inode_shrinker_init(void); +void xfs_inode_shrinker_destroy(void); +void xfs_inode_shrinker_register(struct xfs_mount *mp); +void xfs_inode_shrinker_unregister(struct xfs_mount *mp); #endif diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 5d0ee8d492d..50bee07d6b0 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -891,7 +891,8 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, + XFS_ICI_NO_TAG, 0, NULL); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index b1a5a1ff88e..abb8222b88c 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -223,6 +223,7 @@ typedef struct xfs_perag { int pag_ici_init; /* incore inode cache initialised */ rwlock_t pag_ici_lock; /* incore inode lock */ struct radix_tree_root pag_ici_root; /* incore inode cache root */ + int pag_ici_reclaimable; /* reclaimable inodes */ #endif int pagb_count; /* pagb slots in use */ xfs_perag_busy_t pagb_list[XFS_PAGB_NUM_SLOTS]; /* unstable blocks */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 4fa0bc7b983..9ff48a16a7e 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -259,6 +259,7 @@ typedef struct xfs_mount { wait_queue_head_t m_wait_single_sync_task; __int64_t m_update_flags; /* sb flags we need to update on the next remount,rw */ + struct list_head m_mplist; /* inode shrinker mount list */ } xfs_mount_t; /* -- cgit v1.2.3-70-g09d2 From 12b1b321689cf92236fb216472744e39419fab30 Mon Sep 17 00:00:00 2001 From: Ralf Baechle Date: Mon, 8 Mar 2010 20:51:03 +0100 Subject: Inotify: Fix build failure in inotify user support CONFIG_INOTIFY_USER defined but CONFIG_ANON_INODES undefined will result in the following build failure: LD vmlinux fs/built-in.o: In function 'sys_inotify_init1': (.text.sys_inotify_init1+0x22c): undefined reference to 'anon_inode_getfd' fs/built-in.o: In function `sys_inotify_init1': (.text.sys_inotify_init1+0x22c): relocation truncated to fit: R_MIPS_26 against 'anon_inode_getfd' make[2]: *** [vmlinux] Error 1 make[1]: *** [sub-make] Error 2 make: *** [all] Error 2 Signed-off-by: Ralf Baechle Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/notify/inotify/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig index 3e56dbffe72..b3a159b21cf 100644 --- a/fs/notify/inotify/Kconfig +++ b/fs/notify/inotify/Kconfig @@ -15,6 +15,7 @@ config INOTIFY config INOTIFY_USER bool "Inotify support for userspace" + select ANON_INODES select FSNOTIFY default y ---help--- -- cgit v1.2.3-70-g09d2 From 6b933c8e6f1a2f3118082c455eef25f9b1ac7b45 Mon Sep 17 00:00:00 2001 From: Li Dongyang Date: Sat, 17 Apr 2010 17:49:10 +0800 Subject: ocfs2: Avoid direct write if we fall back to buffered I/O when we fall back to buffered write from direct write, we call __generic_file_aio_write() but that will end up doing direct write even we are only prepared to do buffered write because the file has the O_DIRECT flag set. This is a fix for https://bugzilla.novell.com/show_bug.cgi?id=591039 revised with Joel's comments. Signed-off-by: Li Dongyang Acked-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 20e0ee58dd3..a5fbd9cea96 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1982,18 +1982,18 @@ relock: /* communicate with ocfs2_dio_end_io */ ocfs2_iocb_set_rw_locked(iocb, rw_level); - if (direct_io) { - ret = generic_segment_checks(iov, &nr_segs, &ocount, - VERIFY_READ); - if (ret) - goto out_dio; + ret = generic_segment_checks(iov, &nr_segs, &ocount, + VERIFY_READ); + if (ret) + goto out_dio; - count = ocount; - ret = generic_write_checks(file, ppos, &count, - S_ISBLK(inode->i_mode)); - if (ret) - goto out_dio; + count = ocount; + ret = generic_write_checks(file, ppos, &count, + S_ISBLK(inode->i_mode)); + if (ret) + goto out_dio; + if (direct_io) { written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, ppos, count, ocount); if (written < 0) { @@ -2008,7 +2008,10 @@ relock: goto out_dio; } } else { - written = __generic_file_aio_write(iocb, iov, nr_segs, ppos); + current->backing_dev_info = file->f_mapping->backing_dev_info; + written = generic_file_buffered_write(iocb, iov, nr_segs, *ppos, + ppos, count, 0); + current->backing_dev_info = NULL; } out_dio: -- cgit v1.2.3-70-g09d2 From 973bec34bfc1bc2465646181653d67f767d418c8 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Mon, 3 May 2010 21:00:48 +0900 Subject: nilfs2: fix sync silent failure As of 32a88aa1, __sync_filesystem() will return 0 if s_bdi is not set. And nilfs does not set s_bdi anywhere. I noticed this problem by the warning introduced by the recent commit 5129a469 ("Catch filesystem lacking s_bdi"). WARNING: at fs/super.c:959 vfs_kern_mount+0xc5/0x14e() Hardware name: PowerEdge 2850 Modules linked in: nilfs2 loop tpm_tis tpm tpm_bios video shpchp pci_hotplug output dcdbas Pid: 3773, comm: mount.nilfs2 Not tainted 2.6.34-rc6-debug #38 Call Trace: [] warn_slowpath_common+0x60/0x90 [] warn_slowpath_null+0xd/0x10 [] vfs_kern_mount+0xc5/0x14e [] do_kern_mount+0x32/0xbd [] do_mount+0x671/0x6d0 [] ? __get_free_pages+0x1f/0x21 [] ? copy_mount_options+0x2b/0xe2 [] ? strndup_user+0x48/0x67 [] sys_mount+0x61/0x8f [] sysenter_do_call+0x12/0x32 This ensures to set s_bdi for nilfs and fixes the sync silent failure. Signed-off-by: Ryusuke Konishi Acked-by: Jens Axboe Signed-off-by: Linus Torvalds --- fs/nilfs2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 0cdbc5e7655..48145f505a6 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -749,6 +749,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, sb->s_export_op = &nilfs_export_ops; sb->s_root = NULL; sb->s_time_gran = 1; + sb->s_bdi = nilfs->ns_bdi; err = load_nilfs(nilfs, sbi); if (err) -- cgit v1.2.3-70-g09d2 From c10f5e12bafde7f7a2f9b75d76f7a68d62154e91 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 16 Apr 2010 12:56:11 -0700 Subject: ceph: clear dir complete on d_move d_move() reorders the d_subdirs list, breaking the readdir result caching. Unless/until d_move preserves that ordering, clear CEPH_I_COMPLETE on rename. Signed-off-by: Sage Weil --- fs/ceph/inode.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 26f883c275e..261f3e6c0bc 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -997,6 +997,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, dn, dn->d_name.len, dn->d_name.name); dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + + /* d_move screws up d_subdirs order */ + ceph_i_clear(dir, CEPH_I_COMPLETE); + d_move(req->r_old_dentry, dn); dout(" src %p '%.*s' dst %p '%.*s'\n", req->r_old_dentry, -- cgit v1.2.3-70-g09d2 From 91dee39eebcfb47085c4d457a584b0e9723b6ca0 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Apr 2010 10:15:44 -0700 Subject: ceph: fix snap realm splits The snap realm split was checking i_snap_realm, not the list_head, to determine if an inode belonged in the new realm. The check always failed, which meant we always moved the inode, corrupting the old realm's list and causing various crashes. Also wait to release old realm reference to avoid possibility of use after free. Signed-off-by: Sage Weil --- fs/ceph/snap.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 2b881262ef6..d5114db7045 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -869,16 +869,20 @@ skip_inode: continue; ci = ceph_inode(inode); spin_lock(&inode->i_lock); - if (!ci->i_snap_realm) - goto split_skip_inode; - ceph_put_snap_realm(mdsc, ci->i_snap_realm); - spin_lock(&realm->inodes_with_caps_lock); - list_add(&ci->i_snap_realm_item, - &realm->inodes_with_caps); - ci->i_snap_realm = realm; - spin_unlock(&realm->inodes_with_caps_lock); - ceph_get_snap_realm(mdsc, realm); -split_skip_inode: + if (list_empty(&ci->i_snap_realm_item)) { + struct ceph_snap_realm *oldrealm = + ci->i_snap_realm; + + dout(" moving %p to split realm %llx %p\n", + inode, realm->ino, realm); + spin_lock(&realm->inodes_with_caps_lock); + list_add(&ci->i_snap_realm_item, + &realm->inodes_with_caps); + ci->i_snap_realm = realm; + spin_unlock(&realm->inodes_with_caps_lock); + ceph_get_snap_realm(mdsc, realm); + ceph_put_snap_realm(mdsc, oldrealm); + } spin_unlock(&inode->i_lock); iput(inode); } -- cgit v1.2.3-70-g09d2 From c8f16584ac85444d51d8753c5df502350cfc7bb7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Mon, 19 Apr 2010 13:50:26 -0700 Subject: ceph: print more useful version info on module load Decouple the client version from the server side. Print relevant protocol and map version info instead. Signed-off-by: Sage Weil --- fs/ceph/super.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 75d02eaa127..f888cf487b7 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -996,9 +996,10 @@ static int __init init_ceph(void) if (ret) goto out_icache; - pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n", - CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH, - CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL); + pr_info("loaded (mon/mds/osd proto %d/%d/%d, osdmap %d/%d %d/%d)\n", + CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL, + CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT, + CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT); return 0; out_icache: -- cgit v1.2.3-70-g09d2 From 0b0c06d1476290cea248923c0ee7be9fd61cacea Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Apr 2010 10:27:13 -0700 Subject: ceph: fix leaked spinlock during mds reconnect Signed-off-by: Sage Weil --- fs/ceph/caps.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index aa2239fa9a3..0c168180686 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1861,8 +1861,8 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } else { pr_err("%p auth cap %p not mds%d ???\n", inode, cap, session->s_mds); - spin_unlock(&inode->i_lock); } + spin_unlock(&inode->i_lock); } } -- cgit v1.2.3-70-g09d2 From d45d0d970f495e04a4e4f46acd74e90f4a4564f9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 20 Apr 2010 15:20:33 -0700 Subject: ceph: add missing #includes Signed-off-by: Sage Weil --- fs/ceph/auth.c | 1 + fs/ceph/auth_none.h | 2 ++ fs/ceph/super.h | 1 + 3 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ceph/auth.c b/fs/ceph/auth.c index f6394b94b86..818afe72e6c 100644 --- a/fs/ceph/auth.c +++ b/fs/ceph/auth.c @@ -3,6 +3,7 @@ #include #include #include +#include #include "types.h" #include "auth_none.h" diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h index 56c05533a31..8164df1a08b 100644 --- a/fs/ceph/auth_none.h +++ b/fs/ceph/auth_none.h @@ -1,6 +1,8 @@ #ifndef _FS_CEPH_AUTH_NONE_H #define _FS_CEPH_AUTH_NONE_H +#include + #include "auth.h" /* diff --git a/fs/ceph/super.h b/fs/ceph/super.h index e30dfbb056c..13513b80d87 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include -- cgit v1.2.3-70-g09d2 From 684be25c52a1e43638ced160be0b0b46596e7f2b Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 21 Apr 2010 20:45:59 -0700 Subject: ceph: fix seq counting for skipped messages Increment in_seq even when the message is skipped for some reason. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index cdaaa131add..e7b91e093f5 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1379,6 +1379,7 @@ static int read_partial_message(struct ceph_connection *con) con->in_base_pos = -front_len - middle_len - data_len - sizeof(m->footer); con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; return 0; } if (IS_ERR(con->in_msg)) { @@ -2030,6 +2031,7 @@ void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg) ceph_msg_put(con->in_msg); con->in_msg = NULL; con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; } else { dout("con_revoke_pages %p msg %p pages %p no-op\n", con, con->in_msg, msg); -- cgit v1.2.3-70-g09d2 From ae18756b9fa7bb93132cff06cd8575e3d46633f9 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Apr 2010 07:47:01 -0700 Subject: ceph: discard incoming messages with bad seq # We can get old message seq #'s after a tcp reconnect for stateful sessions (i.e., the MDS). If we get a higher seq #, that is an error, and we shouldn't see any bad seq #'s for stateless (mon, osd) connections. Signed-off-by: Sage Weil --- fs/ceph/messenger.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c index e7b91e093f5..509f57d9ccb 100644 --- a/fs/ceph/messenger.c +++ b/fs/ceph/messenger.c @@ -1334,6 +1334,7 @@ static int read_partial_message(struct ceph_connection *con) unsigned front_len, middle_len, data_len, data_off; int datacrc = con->msgr->nocrc; int skip; + u64 seq; dout("read_partial_message con %p msg %p\n", con, m); @@ -1368,6 +1369,25 @@ static int read_partial_message(struct ceph_connection *con) return -EIO; data_off = le16_to_cpu(con->in_hdr.data_off); + /* verify seq# */ + seq = le64_to_cpu(con->in_hdr.seq); + if ((s64)seq - (s64)con->in_seq < 1) { + pr_info("skipping %s%lld %s seq %lld, expected %lld\n", + ENTITY_NAME(con->peer_name), + pr_addr(&con->peer_addr.in_addr), + seq, con->in_seq + 1); + con->in_base_pos = -front_len - middle_len - data_len - + sizeof(m->footer); + con->in_tag = CEPH_MSGR_TAG_READY; + con->in_seq++; + return 0; + } else if ((s64)seq - (s64)con->in_seq > 1) { + pr_err("read_partial_message bad seq %lld expected %lld\n", + seq, con->in_seq + 1); + con->error_msg = "bad message sequence # for incoming message"; + return -EBADMSG; + } + /* allocate message? */ if (!con->in_msg) { dout("got hdr type %d front %d data %d\n", con->in_hdr.type, -- cgit v1.2.3-70-g09d2 From 5c6a2cdb4fe8aaf6b54f022c14f13d2a12b45914 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 22 Apr 2010 13:48:59 -0700 Subject: ceph: fix direct io truncate offset truncate_inode_pages_range wants the end offset to align with the last byte in a page. Signed-off-by: Sage Weil --- fs/ceph/file.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 4add3d5da2c..ed6f19721d6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -665,7 +665,8 @@ more: * throw out any page cache pages in this range. this * may block. */ - truncate_inode_pages_range(inode->i_mapping, pos, pos+len); + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE-1)); } else { pages = alloc_page_vector(num_pages); if (IS_ERR(pages)) { -- cgit v1.2.3-70-g09d2 From ea1409f96197c1bffe5d7d5bc967b3445edcc1fa Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 28 Apr 2010 16:12:06 -0700 Subject: ceph: clear dir complete, invalidate dentry on replayed rename If a rename operation is resent to the MDS following an MDS restart, the client does not get a full reply (containing the resulting metadata) back. In that case, a ceph_rename() needs to compensate by doing anything useful that fill_inode() would have, like d_move(). It also needs to invalidate the dentry (to workaround the vfs_rename_dir() bug) and clear the dir complete flag, just like fill_trace(). Signed-off-by: Sage Weil --- fs/ceph/dir.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index ea8ee2e526a..650d2db5ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -880,7 +880,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * do_request, above). If there is no trace, we need * to do it here. */ + + /* d_move screws up d_subdirs order */ + ceph_i_clear(new_dir, CEPH_I_COMPLETE); + d_move(old_dentry, new_dentry); + + /* ensure target dentry is invalidated, despite + rehashing bug in vfs_rename_dir */ + new_dentry->d_time = jiffies; + ceph_dentry(new_dentry)->lease_shared_gen = 0; } ceph_mdsc_put_request(req); return err; -- cgit v1.2.3-70-g09d2 From 7ff899da02cb674211858fcd919f8b4511a4423f Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 23 Apr 2010 10:25:33 -0700 Subject: ceph: fix lockless caps check The __ variant requires caller to hold i_lock. Signed-off-by: Sage Weil --- fs/ceph/addr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 412593703d1..4b42c2bb603 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -509,7 +509,7 @@ static void writepages_finish(struct ceph_osd_request *req, u64 bytes = 0; struct ceph_client *client = ceph_inode_to_client(inode); long writeback_stat; - unsigned issued = __ceph_caps_issued(ci, NULL); + unsigned issued = ceph_caps_issued(ci); /* parse reply */ replyhead = msg->front.iov_base; -- cgit v1.2.3-70-g09d2 From b0930f8d38c6ab76dc8222a5a910a21392d38208 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 29 Apr 2010 13:26:53 -0700 Subject: ceph: remove bad auth_x kmem_cache It's useless, since our allocations are already a power of 2. And it was allocated per-instance (not globally), which caused a name collision when we tried to mount a second file system with auth_x enabled. Signed-off-by: Sage Weil --- fs/ceph/auth_x.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) (limited to 'fs') diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c index d9001a4dc8c..fee5a08da88 100644 --- a/fs/ceph/auth_x.c +++ b/fs/ceph/auth_x.c @@ -12,8 +12,6 @@ #include "auth.h" #include "decode.h" -struct kmem_cache *ceph_x_ticketbuf_cachep; - #define TEMP_TICKET_BUF_LEN 256 static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed); @@ -131,13 +129,12 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, char *ticket_buf; u8 struct_v; - dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC); + dbuf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!dbuf) return -ENOMEM; ret = -ENOMEM; - ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, - GFP_NOFS | GFP_ATOMIC); + ticket_buf = kmalloc(TEMP_TICKET_BUF_LEN, GFP_NOFS); if (!ticket_buf) goto out_dbuf; @@ -251,9 +248,9 @@ static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac, ret = 0; out: - kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf); + kfree(ticket_buf); out_dbuf: - kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf); + kfree(dbuf); return ret; bad: @@ -605,8 +602,6 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) remove_ticket_handler(ac, th); } - kmem_cache_destroy(ceph_x_ticketbuf_cachep); - kfree(ac->private); ac->private = NULL; } @@ -641,26 +636,20 @@ int ceph_x_init(struct ceph_auth_client *ac) int ret; dout("ceph_x_init %p\n", ac); + ret = -ENOMEM; xi = kzalloc(sizeof(*xi), GFP_NOFS); if (!xi) - return -ENOMEM; + goto out; - ret = -ENOMEM; - ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf", - TEMP_TICKET_BUF_LEN, 8, - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - NULL); - if (!ceph_x_ticketbuf_cachep) - goto done_nomem; ret = -EINVAL; if (!ac->secret) { pr_err("no secret set (for auth_x protocol)\n"); - goto done_nomem; + goto out_nomem; } ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret); if (ret) - goto done_nomem; + goto out_nomem; xi->starting = true; xi->ticket_handlers = RB_ROOT; @@ -670,10 +659,9 @@ int ceph_x_init(struct ceph_auth_client *ac) ac->ops = &ceph_x_ops; return 0; -done_nomem: +out_nomem: kfree(xi); - if (ceph_x_ticketbuf_cachep) - kmem_cache_destroy(ceph_x_ticketbuf_cachep); +out: return ret; } -- cgit v1.2.3-70-g09d2 From d577632e65ea01fb3b124b652d7bd2381251da3c Mon Sep 17 00:00:00 2001 From: Joel Becker Date: Mon, 3 May 2010 19:15:49 -0700 Subject: ocfs2: Avoid a gcc warning in ocfs2_wipe_inode(). gcc warns that a variable is uninitialized. It's actually handled, but an early return fools gcc. Let's just initialize the variable to a garbage value that will crash if the usage is ever broken. Signed-off-by: Joel Becker --- fs/ocfs2/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 83fe1d38f5c..af189887201 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -725,7 +725,7 @@ static void ocfs2_signal_wipe_completion(struct ocfs2_super *osb, static int ocfs2_wipe_inode(struct inode *inode, struct buffer_head *di_bh) { - int status, orphaned_slot; + int status, orphaned_slot = -1; struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); -- cgit v1.2.3-70-g09d2