From a07ef784356cf9157bd9bed5254cbb9a82d33722 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:52:05 +0900 Subject: f2fs: introduce f2fs_msg to ease adding information prints Introduced f2fs_msg function to differentiate f2fs specific messages in the log. And, added few informative prints in the mount path, to convey proper error in case of mount failure. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 ++ fs/f2fs/super.c | 80 +++++++++++++++++++++++++++++++++++++++++++++------------ 2 files changed, 65 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 13c6dfbb718..8199ee9f587 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -877,6 +877,8 @@ bool f2fs_empty_dir(struct inode *); * super.c */ int f2fs_sync_fs(struct super_block *, int); +extern __printf(3, 4) +void f2fs_msg(struct super_block *, const char *, const char *, ...); /* * hash.c diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 08a94c814bd..afa7ef0c4ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -53,6 +53,18 @@ static match_table_t f2fs_tokens = { {Opt_err, NULL}, }; +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + va_end(args); +} + static void init_once(void *foo) { struct f2fs_inode_info *fi = (struct f2fs_inode_info *) foo; @@ -247,7 +259,8 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static int parse_options(struct f2fs_sb_info *sbi, char *options) +static int parse_options(struct super_block *sb, struct f2fs_sb_info *sbi, + char *options) { substring_t args[MAX_OPT_ARGS]; char *p; @@ -286,7 +299,8 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_nouser_xattr: - pr_info("nouser_xattr options not supported\n"); + f2fs_msg(sb, KERN_INFO, + "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -295,7 +309,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) break; #else case Opt_noacl: - pr_info("noacl options not supported\n"); + f2fs_msg(sb, KERN_INFO, "noacl options not supported"); break; #endif case Opt_active_logs: @@ -309,8 +323,9 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options) set_opt(sbi, DISABLE_EXT_IDENTIFY); break; default: - pr_err("Unrecognized mount option \"%s\" or missing value\n", - p); + f2fs_msg(sb, KERN_ERR, + "Unrecognized mount option \"%s\" or missing value", + p); return -EINVAL; } } @@ -337,23 +352,36 @@ static loff_t max_file_size(unsigned bits) return result; } -static int sanity_check_raw_super(struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct super_block *sb, + struct f2fs_super_block *raw_super) { unsigned int blocksize; - if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) + if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { + f2fs_msg(sb, KERN_INFO, + "Magic Mismatch, valid(0x%x) - read(0x%x)", + F2FS_SUPER_MAGIC, le32_to_cpu(raw_super->magic)); return 1; + } /* Currently, support only 4KB block size */ blocksize = 1 << le32_to_cpu(raw_super->log_blocksize); - if (blocksize != PAGE_CACHE_SIZE) + if (blocksize != PAGE_CACHE_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid blocksize (%u), supports only 4KB\n", + blocksize); return 1; + } if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) + F2FS_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); return 1; + } if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) + F2FS_LOG_SECTORS_PER_BLOCK) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); return 1; + } return 0; } @@ -414,13 +442,16 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; /* set a temporary block size */ - if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) + if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { + f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; + } /* read f2fs raw super block */ raw_super_buf = sb_bread(sb, 0); if (!raw_super_buf) { err = -EIO; + f2fs_msg(sb, KERN_ERR, "unable to read superblock"); goto free_sbi; } raw_super = (struct f2fs_super_block *) @@ -438,12 +469,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi, POSIX_ACL); #endif /* parse mount options */ - if (parse_options(sbi, (char *)data)) + if (parse_options(sb, sbi, (char *)data)) goto free_sb_buf; /* sanity checking of raw super */ - if (sanity_check_raw_super(raw_super)) + if (sanity_check_raw_super(sb, raw_super)) { + f2fs_msg(sb, KERN_ERR, "Can't find a valid F2FS filesystem"); goto free_sb_buf; + } sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); sb->s_max_links = F2FS_LINK_MAX; @@ -477,18 +510,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); goto free_sb_buf; } err = get_valid_checkpoint(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to get valid F2FS checkpoint"); goto free_meta_inode; + } /* sanity checking of checkpoint */ err = -EINVAL; - if (sanity_check_ckpt(raw_super, sbi->ckpt)) + if (sanity_check_ckpt(raw_super, sbi->ckpt)) { + f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); goto free_cp; + } sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); @@ -510,17 +548,24 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* setup f2fs internal modules */ err = build_segment_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS segment manager"); goto free_sm; + } err = build_node_manager(sbi); - if (err) + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS node manager"); goto free_nm; + } build_gc_manager(sbi); /* get an inode for node space */ sbi->node_inode = f2fs_iget(sb, F2FS_NODE_INO(sbi)); if (IS_ERR(sbi->node_inode)) { + f2fs_msg(sb, KERN_ERR, "Failed to read node inode"); err = PTR_ERR(sbi->node_inode); goto free_nm; } @@ -533,6 +578,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); if (IS_ERR(root)) { + f2fs_msg(sb, KERN_ERR, "Failed to read root inode"); err = PTR_ERR(root); goto free_node_inode; } -- cgit v1.2.3-70-g09d2 From 3af60a49fd2edfe9c5a06bc84d4832450895be96 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:52:37 +0900 Subject: f2fs: fix time update in case of f2fs fallocate After doing a punch hole or expanding inode doing fallocation. The change and modification time are not update for the file. So, update time after no issue is observed in fallocate. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7f9ea9271eb..88593c5e743 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -545,6 +545,11 @@ static long f2fs_fallocate(struct file *file, int mode, else ret = expand_inode_data(inode, offset, len, mode); + if (!ret) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); + } + f2fs_balance_fs(sbi); return ret; } -- cgit v1.2.3-70-g09d2 From 24c366a9ea256b86426b42e75f764495a2558861 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sun, 30 Dec 2012 14:53:08 +0900 Subject: f2fs: remove unneeded INIT_LIST_HEAD at few places While creating a new entry for addition to the list(orphan inode list and fsync inode entry list), there is no need to call HEAD initialization for these entries. So, remove that init part. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 1 - fs/f2fs/recovery.c | 1 - 2 files changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 6ef36c37e2b..d75c86a1789 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -214,7 +214,6 @@ retry: goto retry; } new->ino = ino; - INIT_LIST_HEAD(&new->list); /* add new_oentry into list which is sorted by inode number */ if (orphan) { diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index b571fee677d..502c63d8f09 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -151,7 +151,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto out; } - INIT_LIST_HEAD(&entry->list); list_add_tail(&entry->list, head); entry->blkaddr = blkaddr; } -- cgit v1.2.3-70-g09d2 From 7880ceedec55fbc3997d80e68670d03395225367 Mon Sep 17 00:00:00 2001 From: Huajun Li Date: Mon, 31 Dec 2012 13:59:09 +0800 Subject: f2fs: update f2fs partition info about SIT/NAT layout Update partition info output under debug FS to reflect segment layout correctly. Signed-off-by: Huajun Li Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 0e0380a588a..b8ed7a72c6e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -190,8 +190,8 @@ static int stat_show(struct seq_file *s, void *v) update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); - seq_printf(s, "[SB: 1] [CP: 2] [NAT: %d] [SIT: %d] ", - si->nat_area_segs, si->sit_area_segs); + seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", + si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", -- cgit v1.2.3-70-g09d2 From d66d1f76878fcb1e78592fe8aecd13f438d6c0d7 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 08:57:21 +0900 Subject: f2fs: initialize newly allocated dnode structure This patch resolves Coverity #753112. In practical, the existing code flow does not fall into the reported errorneous path. But, anyway, let's avoid this for future. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8199ee9f587..280713289d8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -211,11 +211,11 @@ struct dnode_of_data { static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, struct page *ipage, struct page *npage, nid_t nid) { + memset(dn, 0, sizeof(*dn)); dn->inode = inode; dn->inode_page = ipage; dn->node_page = npage; dn->nid = nid; - dn->inode_page_locked = 0; } /* -- cgit v1.2.3-70-g09d2 From c1b75eabec4eddce55ebb078f84481f58272878f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 09:24:28 +0900 Subject: f2fs: avoid null dereference in f2fs_acl_from_disk This patch resolves Coverity #751303: >>> CID 753103: Explicit null dereferenced (FORWARD_NULL) Passing null >>> pointer "value" to function "f2fs_acl_from_disk(char const *, size_t)", which dereferences it. [Error path] - value = NULL; - retval = 0 by f2fs_getxattr(); - f2fs_acl_from_disk(value:NULL, ...); Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index e95b94945d5..137af4255da 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -191,15 +191,14 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) retval = f2fs_getxattr(inode, name_index, "", value, retval); } - if (retval < 0) { - if (retval == -ENODATA) - acl = NULL; - else - acl = ERR_PTR(retval); - } else { + if (retval > 0) acl = f2fs_acl_from_disk(value, retval); - } + else if (retval == -ENODATA) + acl = NULL; + else + acl = ERR_PTR(retval); kfree(value); + if (!IS_ERR(acl)) set_cached_acl(inode, type, acl); -- cgit v1.2.3-70-g09d2 From c335a86930b4841c11df12e1fdfd8345e0ebce84 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 09:33:20 +0900 Subject: f2fs: check return value during recovery This patch resolves Coverity #753102: >>> No check of the return value of "f2fs_add_link(&dent, inode)". Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 502c63d8f09..6cc046d3681 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -67,7 +67,7 @@ static int recover_dentry(struct page *ipage, struct inode *inode) kunmap(page); f2fs_put_page(page, 0); } else { - f2fs_add_link(&dent, inode); + err = f2fs_add_link(&dent, inode); } iput(dir); out: -- cgit v1.2.3-70-g09d2 From 408e9375610cca6d54e9c654cbe05a647687e12e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 3 Jan 2013 17:55:52 +0900 Subject: f2fs: revisit the f2fs_gc flow I'd like to revisit the f2fs_gc flow and rewrite as follows. 1. In practical, the nGC parameter of f2fs_gc is meaningless. So, let's remove it. 2. Background GC marks victim blocks as dirty one at a time. 3. Foreground GC should do cleaning job until acquiring enough free sections. Afterwards, it needs to do checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/gc.c | 60 +++++++++++++++++++------------------------------------ fs/f2fs/segment.c | 2 +- 3 files changed, 23 insertions(+), 41 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 280713289d8..285e43d602f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -986,7 +986,7 @@ int do_write_data_page(struct page *); int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); -int f2fs_gc(struct f2fs_sb_info *, int); +int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); int create_gc_caches(void); void destroy_gc_caches(void); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b0ec721e984..b4dd90cf1f1 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -78,7 +78,7 @@ static int gc_thread_func(void *data) sbi->bg_gc++; - if (f2fs_gc(sbi, 1) == GC_NONE) + if (f2fs_gc(sbi) == GC_NONE) wait_ms = GC_THREAD_NOGC_SLEEP_TIME; else if (wait_ms == GC_THREAD_NOGC_SLEEP_TIME) wait_ms = GC_THREAD_MAX_SLEEP_TIME; @@ -651,62 +651,44 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, return ret; } -int f2fs_gc(struct f2fs_sb_info *sbi, int nGC) +int f2fs_gc(struct f2fs_sb_info *sbi) { - unsigned int segno; - int old_free_secs, cur_free_secs; - int gc_status, nfree; struct list_head ilist; + unsigned int segno, i; int gc_type = BG_GC; + int gc_status = GC_NONE; INIT_LIST_HEAD(&ilist); gc_more: - nfree = 0; - gc_status = GC_NONE; + if (!(sbi->sb->s_flags & MS_ACTIVE)) + goto stop; if (has_not_enough_free_secs(sbi)) - old_free_secs = reserved_sections(sbi); - else - old_free_secs = free_sections(sbi); - - while (sbi->sb->s_flags & MS_ACTIVE) { - int i; - if (has_not_enough_free_secs(sbi)) - gc_type = FG_GC; + gc_type = FG_GC; - cur_free_secs = free_sections(sbi) + nfree; + if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + goto stop; - /* We got free space successfully. */ - if (nGC < cur_free_secs - old_free_secs) - break; - - if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) + for (i = 0; i < sbi->segs_per_sec; i++) { + /* + * do_garbage_collect will give us three gc_status: + * GC_ERROR, GC_DONE, and GC_BLOCKED. + * If GC is finished uncleanly, we have to return + * the victim to dirty segment list. + */ + gc_status = do_garbage_collect(sbi, segno + i, &ilist, gc_type); + if (gc_status != GC_DONE) break; - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * do_garbage_collect will give us three gc_status: - * GC_ERROR, GC_DONE, and GC_BLOCKED. - * If GC is finished uncleanly, we have to return - * the victim to dirty segment list. - */ - gc_status = do_garbage_collect(sbi, segno + i, - &ilist, gc_type); - if (gc_status != GC_DONE) - goto stop; - nfree++; - } } -stop: - if (has_not_enough_free_secs(sbi) || gc_status == GC_BLOCKED) { + if (has_not_enough_free_secs(sbi)) { write_checkpoint(sbi, (gc_status == GC_BLOCKED), false); - if (nfree) + if (has_not_enough_free_secs(sbi)) goto gc_more; } +stop: mutex_unlock(&sbi->gc_mutex); put_gc_inode(&ilist); - BUG_ON(!list_empty(&ilist)); return gc_status; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index de6240922b0..4b009906658 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -31,7 +31,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi) */ if (has_not_enough_free_secs(sbi)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, 1); + f2fs_gc(sbi); } } -- cgit v1.2.3-70-g09d2 From 7d82db83165dbac8c3f6d47b73c84f38e3996e30 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Jan 2013 13:10:49 +0900 Subject: f2fs: add f2fs_balance_fs in several interfaces The f2fs_balance_fs() is to check the number of free sections and decide whether it needs to conduct cleaning or not. If there are not enough free sections, the cleaning job should be started. In order to control an amount of free sections even under high utilization, f2fs should call f2fs_balance_fs at all the VFS interfaces that are able to produce dirty pages. This patch adds the function calls in the missing interfaces as follows. 1. f2fs_setxattr() The f2fs_setxattr() produces dirty node pages so that we should call f2fs_balance_fs() either likewise doing in other VFS interfaces such as f2fs_lookup(), f2fs_mkdir(), and so on. 2. f2fs_sync_file() We should guarantee serving free sections for syncing metadata during fsync. Previously, there is no space check before triggering checkpoint and sync_node_pages. Therefore, if a bunch of fsync calls are triggered under 100% of FS utilization, f2fs is able to be faced with no free sections, resulting in BUG_ON(). 3. f2fs_sync_fs() Before calling write_checkpoint(), we should guarantee that there are minimum free sections. 4. f2fs_write_inode() f2fs_write_inode() is also able to produce dirty node pages. Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ fs/f2fs/inode.c | 3 +++ fs/f2fs/super.c | 2 ++ fs/f2fs/xattr.c | 2 ++ 4 files changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 88593c5e743..7354c2df108 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -137,6 +137,9 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; + /* guarantee free sections for fsync */ + f2fs_balance_fs(sbi); + mutex_lock(&inode->i_mutex); if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bf20b4d0321..79424177732 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -217,6 +217,9 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; + if (wbc) + f2fs_balance_fs(sbi); + node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index afa7ef0c4ba..0f2b2eb86a0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -137,6 +137,8 @@ int f2fs_sync_fs(struct super_block *sb, int sync) if (sync) write_checkpoint(sbi, false, false); + else + f2fs_balance_fs(sbi); return 0; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 940136a3d3a..8038c049650 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -318,6 +318,8 @@ int f2fs_setxattr(struct inode *inode, int name_index, const char *name, if (name_len > 255 || value_len > MAX_VALUE_LEN) return -ERANGE; + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, NODE_NEW); if (!fi->i_xattr_nid) { /* Allocate new attribute block */ -- cgit v1.2.3-70-g09d2 From 9eaeba701386037cdd2ccd8bf8650feb2e2cec31 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Jan 2013 14:09:38 +0900 Subject: f2fs: move f2fs_balance_fs to punch_hole The f2fs_fallocate() has two operations: punch_hole and expand_size. Only in the case of punch_hole, dirty node pages can be produced, so let's trigger f2fs_balance_fs() in this case only. Furthermore, let's trigger it at every data truncation routine. Signed-off-by: Namjae Jeon Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7354c2df108..819de7f39f2 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -410,6 +410,8 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + f2fs_balance_fs(sbi); + mutex_lock_op(sbi, DATA_TRUNC); set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, index, RDONLY_NODE); @@ -537,7 +539,6 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file->f_path.dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); long ret; if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) @@ -552,8 +553,6 @@ static long f2fs_fallocate(struct file *file, int mode, inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } - - f2fs_balance_fs(sbi); return ret; } -- cgit v1.2.3-70-g09d2 From ff9234ad4e974768455071c91bd76402e4af8a28 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jan 2013 14:41:13 +0900 Subject: f2fs: remove redundant call to set_blocksize in f2fs_fill_super Since, f2fs supports only 4KB blocksize, which is set at the beginning in f2fs_fill_super. So, we do not need to again check this blocksize setting in such case. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 0f2b2eb86a0..ac127fde8e1 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -443,7 +443,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; - /* set a temporary block size */ + /* set a block size */ if (!sb_set_blocksize(sb, F2FS_BLKSIZE)) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; @@ -542,10 +542,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) INIT_LIST_HEAD(&sbi->dir_inode_list); spin_lock_init(&sbi->dir_inode_lock); - /* init super block */ - if (!sb_set_blocksize(sb, sbi->blocksize)) - goto free_cp; - init_orphan_info(sbi); /* setup f2fs internal modules */ -- cgit v1.2.3-70-g09d2 From 163799872b65b0cbf0091d82971233cc3d2425d3 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 12 Jan 2013 14:41:33 +0900 Subject: f2fs: avoid redundant time update for parent directory in f2fs_delete_entry In call to f2fs_delete_entry, 'dir' time modification code is put at two places. So, remove the redundant code for timing update. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 951ed52748f..989980e16d0 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -503,7 +503,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, } if (inode) { - inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; + inode->i_ctime = CURRENT_TIME; drop_nlink(inode); if (S_ISDIR(inode->i_mode)) { drop_nlink(inode); -- cgit v1.2.3-70-g09d2 From cfa7a9ccda711ac6ab8f0d17c3a9b540092d305a Mon Sep 17 00:00:00 2001 From: Tsutomu Itoh Date: Mon, 17 Dec 2012 06:38:51 +0000 Subject: Btrfs: fix memory leak in name_cache_insert() We should free name_cache_entry before returning from the error handling code. Signed-off-by: Tsutomu Itoh --- fs/btrfs/send.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 54454542ad4..321b7fb4e44 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -1814,8 +1814,10 @@ static int name_cache_insert(struct send_ctx *sctx, (unsigned long)nce->ino); if (!nce_head) { nce_head = kmalloc(sizeof(*nce_head), GFP_NOFS); - if (!nce_head) + if (!nce_head) { + kfree(nce); return -ENOMEM; + } INIT_LIST_HEAD(nce_head); ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head); -- cgit v1.2.3-70-g09d2 From cc975eb4605c5765a5d5e7a51d24ba5a1cda269e Mon Sep 17 00:00:00 2001 From: Lukas Czerner Date: Fri, 7 Dec 2012 10:09:19 +0000 Subject: btrfs: get the device in write mode when deleting it When we're deleting the device we should get it in write mode since we're going to re-write the super block magic on that device. And it should fail if the device is read-only. Signed-off-by: Lukas Czerner --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5cce6aa7401..86279c37de6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1431,7 +1431,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } } else { ret = btrfs_get_bdev_and_sb(device_path, - FMODE_READ | FMODE_EXCL, + FMODE_WRITE | FMODE_EXCL, root->fs_info->bdev_holder, 0, &bdev, &bh); if (ret) -- cgit v1.2.3-70-g09d2 From d86e56cf7d3669dd292012ac82b986bd1573b6cc Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Thu, 15 Nov 2012 11:35:41 +0000 Subject: Btrfs: disable qgroup id 0 Qgroup id 0 is a special number, we should set the id of a qgroup to 0. Fix it. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 7624212ae92..dd8e3448fe8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3698,6 +3698,11 @@ static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg) goto drop_write; } + if (!sa->qgroupid) { + ret = -EINVAL; + goto out; + } + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) { ret = PTR_ERR(trans); -- cgit v1.2.3-70-g09d2 From 5c39da5b6ca23e68e7acea7f4c01470383475214 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Mon, 22 Oct 2012 11:39:53 +0000 Subject: Btrfs: do not delete a subvolume which is in a R/O subvolume Step to reproduce: # mkfs.btrfs # mount # btrfs sub create /subv0 # btrfs sub snap /subv0/snap0 # change /subv0 from R/W to R/O # btrfs sub del /subv0/snap0 We deleted the snapshot successfully. I think we should not be able to delete the snapshot since the parent subvolume is R/O. Signed-off-by: Miao Xie --- fs/btrfs/ioctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index dd8e3448fe8..5a72896bd76 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2095,13 +2095,13 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, err = inode_permission(inode, MAY_WRITE | MAY_EXEC); if (err) goto out_dput; - - /* check if subvolume may be deleted by a non-root user */ - err = btrfs_may_delete(dir, dentry, 1); - if (err) - goto out_dput; } + /* check if subvolume may be deleted by a user */ + err = btrfs_may_delete(dir, dentry, 1); + if (err) + goto out_dput; + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) { err = -EINVAL; goto out_dput; -- cgit v1.2.3-70-g09d2 From dba60f3f5d564167118cad151a7d41dfe8d2a5f7 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 09:19:51 +0000 Subject: Btrfs: fix resize a readonly device We should not resize a readonly device, fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 5a72896bd76..0de21213d05 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1362,6 +1362,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, printk(KERN_INFO "btrfs: resizing devid %llu\n", (unsigned long long)devid); } + device = btrfs_find_device(root->fs_info, devid, NULL, NULL); if (!device) { printk(KERN_INFO "btrfs: resizer unable to find device %llu\n", @@ -1369,9 +1370,10 @@ static noinline int btrfs_ioctl_resize(struct file *file, ret = -EINVAL; goto out_free; } - if (device->fs_devices && device->fs_devices->seeding) { + + if (!device->writeable) { printk(KERN_INFO "btrfs: resizer unable to apply on " - "seeding device %llu\n", + "readonly device %llu\n", (unsigned long long)devid); ret = -EINVAL; goto out_free; -- cgit v1.2.3-70-g09d2 From 97547676570b3bd908560741315bf4b7d635bcf5 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Fri, 21 Dec 2012 10:38:50 +0000 Subject: Btrfs: fix missing write access release in btrfs_ioctl_resize() We forget to give up the write access after we find some device operation is going on. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0de21213d05..982c0b9ceea 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1339,6 +1339,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINPROGRESS; } -- cgit v1.2.3-70-g09d2 From 72bcd99d450cb1dde8bf13c3b65fc5883b2a3893 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 15:16:34 -0500 Subject: Btrfs: set flushing if we're limited flushing We still need to say we're flushing if we're limit flushing to keep somebody from coming in and stealing our reservation. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index d133edfcd44..61fefda74ff 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3997,7 +3997,7 @@ again: * We make the other tasks wait for the flush only when we can flush * all things. */ - if (ret && flush == BTRFS_RESERVE_FLUSH_ALL) { + if (ret && flush != BTRFS_RESERVE_NO_FLUSH) { flushing = true; space_info->flush = 1; } -- cgit v1.2.3-70-g09d2 From f3fe820c20a1a36c790545184e734e78d61cd68d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 7 Jan 2013 17:03:21 -0500 Subject: Btrfs: add orphan before truncating pagecache Running xfstests 83 in a loop would sometimes fail the fsck. This happens because if we invalidate a page that already has an ordered extent setup for it we will complete the ordered extent ourselves, assuming that the truncate will clean everything up. The problem with this is there is plenty of time for the truncate to fail after we've done this work. So to fix this we need to add the orphan item first to make sure the cleanup gets done properly, and then we can truncate the pagecache and all that stuff and be safe. This fixes the btrfsck failures I was seeing while running 83 in a loop. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 53 ++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 38 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 67ed24ae86b..4ddcf79e789 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2478,6 +2478,18 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) continue; } nr_truncate++; + + /* 1 for the orphan item deletion. */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + goto out; + } + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + goto out; + ret = btrfs_truncate(inode); } else { nr_unlink++; @@ -3783,9 +3795,34 @@ static int btrfs_setsize(struct inode *inode, loff_t newsize) set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, &BTRFS_I(inode)->runtime_flags); + /* + * 1 for the orphan item we're going to add + * 1 for the orphan item deletion. + */ + trans = btrfs_start_transaction(root, 2); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + /* + * We need to do this in case we fail at _any_ point during the + * actual truncate. Once we do the truncate_setsize we could + * invalidate pages which forces any outstanding ordered io to + * be instantly completed which will give us extents that need + * to be truncated. If we fail to get an orphan inode down we + * could have left over extents that were never meant to live, + * so we need to garuntee from this point on that everything + * will be consistent. + */ + ret = btrfs_orphan_add(trans, inode); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + /* we don't support swapfiles, so vmtruncate shouldn't fail */ truncate_setsize(inode, newsize); ret = btrfs_truncate(inode); + if (ret && inode->i_nlink) + btrfs_orphan_del(NULL, inode); } return ret; @@ -6929,11 +6966,9 @@ static int btrfs_truncate(struct inode *inode) /* * 1 for the truncate slack space - * 1 for the orphan item we're going to add - * 1 for the orphan item deletion * 1 for updating the inode. */ - trans = btrfs_start_transaction(root, 4); + trans = btrfs_start_transaction(root, 2); if (IS_ERR(trans)) { err = PTR_ERR(trans); goto out; @@ -6944,12 +6979,6 @@ static int btrfs_truncate(struct inode *inode) min_size); BUG_ON(ret); - ret = btrfs_orphan_add(trans, inode); - if (ret) { - btrfs_end_transaction(trans, root); - goto out; - } - /* * setattr is responsible for setting the ordered_data_close flag, * but that is only tested during the last file release. That @@ -7018,12 +7047,6 @@ static int btrfs_truncate(struct inode *inode) ret = btrfs_orphan_del(trans, inode); if (ret) err = ret; - } else if (ret && inode->i_nlink > 0) { - /* - * Failed to do the truncate, remove us from the in memory - * orphan list. - */ - ret = btrfs_orphan_del(NULL, inode); } if (trans) { -- cgit v1.2.3-70-g09d2 From ac5c93005b7073732e268606688fb6c821d5310e Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Thu, 27 Dec 2012 09:01:24 +0000 Subject: Btrfs: let allocation start from the right raid type This'd avoid us empty looping. Say we have only one disk and the metadata raid type will be defaultly DUP, and we do not need to start from index=0(RAID10) and get over two empty loops to index=2(DUP). Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 61fefda74ff..aeba53191ec 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5560,7 +5560,7 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, int empty_cluster = 2 * 1024 * 1024; struct btrfs_space_info *space_info; int loop = 0; - int index = 0; + int index = __get_raid_index(data); int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ? RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC; bool found_uncached_bg = false; -- cgit v1.2.3-70-g09d2 From 3268a2468eb6a31af89930cbae58a62fe6ca6d2d Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Fri, 28 Dec 2012 09:33:19 +0000 Subject: Btrfs: reset path lock state to zero We forgot to reset the path lock state to zero after we unlock the path block, and this can lead to the ASSERT checker in tree unlock API. Reported-by: Slava Barinov Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index aeba53191ec..85b8454d960 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -6788,11 +6788,13 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, &wc->flags[level]); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return ret; } BUG_ON(wc->refs[level] == 0); if (wc->refs[level] == 1) { btrfs_tree_unlock_rw(eb, path->locks[level]); + path->locks[level] = 0; return 1; } } -- cgit v1.2.3-70-g09d2 From 1214b53f90131fee1f950010c43e92455fe598ab Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 03:53:08 +0000 Subject: Btrfs: fix off-by-one in lseek Lock end is inclusive. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 20452c110d7..fa48051484b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2242,6 +2242,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) if (lockend <= lockstart) lockend = lockstart + root->sectorsize; + lockend--; len = lockend - lockstart + 1; len = max_t(u64, len, root->sectorsize); -- cgit v1.2.3-70-g09d2 From f9e4fb53938de5db01950c9dfe479703b2f5c964 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 7 Jan 2013 10:10:12 +0000 Subject: Btrfs: fix a bug when llseek for delalloc bytes behind prealloc extents xfstests case 285 complains. It it because btrfs did not try to find unwritten delalloc bytes(only dirty pages, not yet writeback) behind prealloc extents, it ends up finding nothing while we're with SEEK_DATA. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/file.c | 9 ++++++--- fs/btrfs/inode.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fa48051484b..841cfe3be0e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2309,9 +2309,12 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int origin) } } - *offset = start; - free_extent_map(em); - break; + if (!test_bit(EXTENT_FLAG_PREALLOC, + &em->flags)) { + *offset = start; + free_extent_map(em); + break; + } } } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4ddcf79e789..ac98384b174 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5623,10 +5623,13 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag return em; if (em) { /* - * if our em maps to a hole, there might - * actually be delalloc bytes behind it + * if our em maps to + * - a hole or + * - a pre-alloc extent, + * there might actually be delalloc bytes behind it. */ - if (em->block_start != EXTENT_MAP_HOLE) + if (em->block_start != EXTENT_MAP_HOLE && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) return em; else hole_em = em; @@ -5708,6 +5711,8 @@ struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *pag */ em->block_start = hole_em->block_start; em->block_len = hole_len; + if (test_bit(EXTENT_FLAG_PREALLOC, &hole_em->flags)) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); } else { em->start = range_start; em->len = found; -- cgit v1.2.3-70-g09d2 From f276795627045a3c599a60b476767861e4318c7d Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 8 Jan 2013 19:37:58 +0000 Subject: btrfs: fix btrfs_cont_expand() freeing IS_ERR em btrfs_cont_expand() tries to free an IS_ERR em as it gets an error from btrfs_get_extent() and breaks out of its loop. An instance of -EEXIST was reported in the wild: https://bugzilla.redhat.com/show_bug.cgi?id=874407 I have no idea if that -EEXIST is surprising, or not. Regardless, this error handling should be cleaned up to handle other reasonable errors (ENOMEM, EIO; whatever). This seemed to be the only buggy freeing of the relatively rare IS_ERR em so I opted to fix the caller rather than teach free_extent_map() to use IS_ERR_OR_NULL(). Signed-off-by: Zach Brown Reviewed-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ac98384b174..3d2c64d4734 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3677,6 +3677,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) block_end - cur_offset, 0); if (IS_ERR(em)) { err = PTR_ERR(em); + em = NULL; break; } last_byte = min(extent_map_end(em), block_end); -- cgit v1.2.3-70-g09d2 From 3972f2603d8570effaf633cea52b12c7c2773c11 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 12 Jan 2013 02:57:22 +0000 Subject: btrfs: update timestamps on truncate() truncate() vs. ftruncate() differ in the VFS; truncate() doesn't set (ATTR_CTIME | ATTR_MTIME), and it's up to the fs to do the timestamp updates if the size changes. Signed-off-by: Eric Sandeen Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d2c64d4734..9bc6c40b182 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,7 +88,7 @@ static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, }; -static int btrfs_setsize(struct inode *inode, loff_t newsize); +static int btrfs_setsize(struct inode *inode, struct iattr *attr); static int btrfs_truncate(struct inode *inode); static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent); static noinline int cow_file_range(struct inode *inode, @@ -3761,16 +3761,27 @@ next: return err; } -static int btrfs_setsize(struct inode *inode, loff_t newsize) +static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_trans_handle *trans; loff_t oldsize = i_size_read(inode); + loff_t newsize = attr->ia_size; + int mask = attr->ia_valid; int ret; if (newsize == oldsize) return 0; + /* + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME is a + * special case where we need to update the times despite not having + * these flags set. For all other operations the VFS set these flags + * explicitly if it wants a timestamp update. + */ + if (newsize != oldsize && (!(mask & (ATTR_CTIME | ATTR_MTIME)))) + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + if (newsize > oldsize) { truncate_pagecache(inode, oldsize, newsize); ret = btrfs_cont_expand(inode, oldsize, newsize); @@ -3843,7 +3854,7 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - err = btrfs_setsize(inode, attr->ia_size); + err = btrfs_setsize(inode, attr); if (err) return err; } -- cgit v1.2.3-70-g09d2 From fa9150a84ca333f68127097c4fa1eda4b3913a22 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 15 Jan 2013 16:45:24 +0900 Subject: f2fs: remove the blk_plug usage in f2fs_write_data_pages Let's consider the usage of blk_plug in f2fs_write_data_pages(). We can come up with the two issues: lock contention and task awareness. 1. Merging bios prior to grabing "queue lock" The f2fs merges consecutive IOs in the file system level before submitting any bios, which is similar with the back merge by the plugging mechanism in attempt_plug_merge(). Both of them need to acquire no queue lock. 2. Merging policy with respect to tasks The f2fs merges IOs as much as possible regardless of tasks, while blk-plugging is conducted on a basis of tasks. As we can understand there are trade-offs, f2fs tries to maximize the write performance with well-merged bios. As a result, if f2fs produces many consecutive but separated bios in writepages(), it would be good to use blk-plugging since f2fs would be able to avoid queue lock contention in the block layer by merging them. But, f2fs merges IOs and submit one bio, which means that there are not much chances to merge bios by attempt_plug_merge(). However, f2fs has already been used blk_plug by triggering generic_writepages() in f2fs_write_data_pages(). So to make the overall code consistency, I'd like to remove blk_plug there. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3aa5ce7cab8..b1347fc6d68 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -547,6 +547,15 @@ redirty_out: #define MAX_DESIRED_PAGES_WP 4096 +static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct address_space *mapping = data; + int ret = mapping->a_ops->writepage(page, wbc); + mapping_set_error(mapping, ret); + return ret; +} + static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -563,7 +572,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!S_ISDIR(inode->i_mode)) mutex_lock(&sbi->writepages); - ret = generic_writepages(mapping, wbc); + ret = write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); if (!S_ISDIR(inode->i_mode)) mutex_unlock(&sbi->writepages); f2fs_submit_bio(sbi, DATA, (wbc->sync_mode == WB_SYNC_ALL)); -- cgit v1.2.3-70-g09d2 From 66af62ce7588736ae65edfdb1c0df597775c4d21 Mon Sep 17 00:00:00 2001 From: majianpeng Date: Mon, 14 Jan 2013 20:08:16 +0800 Subject: f2fs: add global mutex_lock to protect f2fs_stat_list There is an race condition between umounting f2fs and reading f2fs/status, which results in oops. Fox example: Thread A Thread B umount f2fs cat f2fs/status f2fs_destroy_stats() { stat_show() { list_for_each_entry_safe(&f2fs_stat_list) list_del(&si->stat_list); mutex_lock(&si->stat_lock); si->sbi = NULL; mutex_unlock(&si->stat_lock); kfree(sbi->stat_info); } mutex_lock(&si->stat_lock) <- si is gone. ... } Solution with a global lock: f2fs_stat_mutex: Thread A Thread B umount f2fs cat f2fs/status f2fs_destroy_stats() { stat_show() { mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); mutex_unlock(&f2fs_stat_mutex); kfree(sbi->stat_info); mutex_lock(&f2fs_stat_mutex); } list_for_each_entry_safe(&f2fs_stat_list) ... mutex_unlock(&f2fs_stat_mutex); } Signed-off-by: Jianpeng Ma [jaegeuk.kim@samsung.com: fix typos, description, and remove the existing lock] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index b8ed7a72c6e..73f034a9418 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -26,6 +26,7 @@ static LIST_HEAD(f2fs_stat_list); static struct dentry *debugfs_root; +static DEFINE_MUTEX(f2fs_stat_mutex); static void update_general_status(struct f2fs_sb_info *sbi) { @@ -180,13 +181,9 @@ static int stat_show(struct seq_file *s, void *v) int i = 0; int j; + mutex_lock(&f2fs_stat_mutex); list_for_each_entry_safe(si, next, &f2fs_stat_list, stat_list) { - mutex_lock(&si->stat_lock); - if (!si->sbi) { - mutex_unlock(&si->stat_lock); - continue; - } update_general_status(si->sbi); seq_printf(s, "\n=====[ partition info. #%d ]=====\n", i++); @@ -286,8 +283,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, "\nMemory: %u KB = static: %u + cached: %u\n", (si->base_mem + si->cache_mem) >> 10, si->base_mem >> 10, si->cache_mem >> 10); - mutex_unlock(&si->stat_lock); } + mutex_unlock(&f2fs_stat_mutex); return 0; } @@ -313,9 +310,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return -ENOMEM; si = sbi->stat_info; - mutex_init(&si->stat_lock); - list_add_tail(&si->stat_list, &f2fs_stat_list); - si->all_area_segs = le32_to_cpu(raw_super->segment_count); si->sit_area_segs = le32_to_cpu(raw_super->segment_count_sit); si->nat_area_segs = le32_to_cpu(raw_super->segment_count_nat); @@ -325,6 +319,11 @@ static int init_stats(struct f2fs_sb_info *sbi) si->main_area_zones = si->main_area_sections / le32_to_cpu(raw_super->secs_per_zone); si->sbi = sbi; + + mutex_lock(&f2fs_stat_mutex); + list_add_tail(&si->stat_list, &f2fs_stat_list); + mutex_unlock(&f2fs_stat_mutex); + return 0; } @@ -347,10 +346,10 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; + mutex_lock(&f2fs_stat_mutex); list_del(&si->stat_list); - mutex_lock(&si->stat_lock); - si->sbi = NULL; - mutex_unlock(&si->stat_lock); + mutex_unlock(&f2fs_stat_mutex); + kfree(sbi->stat_info); } -- cgit v1.2.3-70-g09d2 From 4589d25d015c2d02bb5f7075d0cbf6dcf23a33c0 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 15 Jan 2013 19:58:47 +0900 Subject: f2fs: fix the debugfs entry creation path As the "status" debugfs entry will be maintained for entire F2FS filesystem irrespective of the number of partitions. So, we can move the initialization to the init part of the f2fs and destroy will be done from exit part. After making changes, for individual partition mount - entry creation code will not be executed. Signed-off-by: Jianpeng Ma Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 27 ++++++++++----------------- fs/f2fs/f2fs.h | 6 ++++-- fs/f2fs/super.c | 7 +++++-- 3 files changed, 19 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 73f034a9418..c8c37307b32 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -300,7 +300,7 @@ static const struct file_operations stat_fops = { .release = single_release, }; -static int init_stats(struct f2fs_sb_info *sbi) +int f2fs_build_stats(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_stat_info *si; @@ -327,21 +327,6 @@ static int init_stats(struct f2fs_sb_info *sbi) return 0; } -int f2fs_build_stats(struct f2fs_sb_info *sbi) -{ - int retval; - - retval = init_stats(sbi); - if (retval) - return retval; - - if (!debugfs_root) - debugfs_root = debugfs_create_dir("f2fs", NULL); - - debugfs_create_file("status", S_IRUGO, debugfs_root, NULL, &stat_fops); - return 0; -} - void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = sbi->stat_info; @@ -353,7 +338,15 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(sbi->stat_info); } -void destroy_root_stats(void) +void __init f2fs_create_root_stats(void) +{ + debugfs_root = debugfs_create_dir("f2fs", NULL); + if (debugfs_root) + debugfs_create_file("status", S_IRUGO, debugfs_root, + NULL, &stat_fops); +} + +void f2fs_destroy_root_stats(void) { debugfs_remove_recursive(debugfs_root); debugfs_root = NULL; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 285e43d602f..976325d51e3 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1060,7 +1060,8 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void destroy_root_stats(void); +void f2fs_create_root_stats(void); +void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) #define stat_inc_seg_count(si, type) @@ -1070,7 +1071,8 @@ void destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void destroy_root_stats(void) { } +static inline void f2fs_create_root_stats(void) { } +static inline void f2fs_destroy_root_stats(void) { } #endif extern const struct file_operations f2fs_dir_operations; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ac127fde8e1..d551a724b73 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -675,14 +675,17 @@ static int __init init_f2fs_fs(void) err = create_checkpoint_caches(); if (err) goto fail; - return register_filesystem(&f2fs_fs_type); + err = register_filesystem(&f2fs_fs_type); + if (err) + goto fail; + f2fs_create_root_stats(); fail: return err; } static void __exit exit_f2fs_fs(void) { - destroy_root_stats(); + f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); destroy_checkpoint_caches(); destroy_gc_caches(); -- cgit v1.2.3-70-g09d2 From 8ce03fd76d323526a693d05d85296ef07a387a9f Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sat, 17 Nov 2012 12:45:47 +0100 Subject: cuse: use mutex as registration lock instead of spinlocks We need to check for name-collisions during cuse-device registration. To avoid race-conditions, this needs to be protected during the whole device registration. Therefore, replace the spinlocks by mutexes first so we can safely extend the locked regions to include more expensive or sleeping code paths. Signed-off-by: David Herrmann Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index ee8d5504229..048e89f2508 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -45,7 +45,6 @@ #include #include #include -#include #include #include @@ -63,7 +62,7 @@ struct cuse_conn { bool unrestricted_ioctl; }; -static DEFINE_SPINLOCK(cuse_lock); /* protects cuse_conntbl */ +static DEFINE_MUTEX(cuse_lock); /* protects registration */ static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN]; static struct class *cuse_class; @@ -114,14 +113,14 @@ static int cuse_open(struct inode *inode, struct file *file) int rc; /* look up and get the connection */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_for_each_entry(pos, cuse_conntbl_head(devt), list) if (pos->dev->devt == devt) { fuse_conn_get(&pos->fc); cc = pos; break; } - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* dead? */ if (!cc) @@ -377,9 +376,9 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* announce device availability */ dev_set_uevent_suppress(dev, 0); @@ -520,9 +519,9 @@ static int cuse_channel_release(struct inode *inode, struct file *file) int rc; /* remove from the conntbl, no more access from this point on */ - spin_lock(&cuse_lock); + mutex_lock(&cuse_lock); list_del_init(&cc->list); - spin_unlock(&cuse_lock); + mutex_unlock(&cuse_lock); /* remove device */ if (cc->dev) -- cgit v1.2.3-70-g09d2 From 30783587b0f318b9e2e165f34cf5dfd9425a4904 Mon Sep 17 00:00:00 2001 From: David Herrmann Date: Sat, 17 Nov 2012 12:45:48 +0100 Subject: cuse: do not register multiple devices with identical names Sysfs doesn't allow two devices with the same name, but we register a sysfs entry for each cuse device without checking for name collisions. This extends the registration to first check whether the name was already registered. To avoid race-conditions between the name-check and linking the device, we need to protect the whole registration with a mutex. Signed-off-by: David Herrmann Acked-by: Tejun Heo Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 048e89f2508..2a2797e2abc 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -304,14 +304,14 @@ static void cuse_gendev_release(struct device *dev) */ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - struct cuse_conn *cc = fc_to_cc(fc); + struct cuse_conn *cc = fc_to_cc(fc), *pos; struct cuse_init_out *arg = req->out.args[0].value; struct page *page = req->pages[0]; struct cuse_devinfo devinfo = { }; struct device *dev; struct cdev *cdev; dev_t devt; - int rc; + int rc, i; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) { @@ -355,15 +355,24 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) dev_set_drvdata(dev, cc); dev_set_name(dev, "%s", devinfo.name); + mutex_lock(&cuse_lock); + + /* make sure the device-name is unique */ + for (i = 0; i < CUSE_CONNTBL_LEN; ++i) { + list_for_each_entry(pos, &cuse_conntbl[i], list) + if (!strcmp(dev_name(pos->dev), dev_name(dev))) + goto err_unlock; + } + rc = device_add(dev); if (rc) - goto err_device; + goto err_unlock; /* register cdev */ rc = -ENOMEM; cdev = cdev_alloc(); if (!cdev) - goto err_device; + goto err_unlock; cdev->owner = THIS_MODULE; cdev->ops = &cuse_frontend_fops; @@ -376,7 +385,6 @@ static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req) cc->cdev = cdev; /* make the device available */ - mutex_lock(&cuse_lock); list_add(&cc->list, cuse_conntbl_head(devt)); mutex_unlock(&cuse_lock); @@ -390,7 +398,8 @@ out: err_cdev: cdev_del(cdev); -err_device: +err_unlock: + mutex_unlock(&cuse_lock); put_device(dev); err_region: unregister_chrdev_region(devt, 1); -- cgit v1.2.3-70-g09d2 From e2560362cc2b39a0567cab510121a7e93dfbe797 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 15 Jan 2013 12:24:46 +0100 Subject: cuse: fix uninitialized variable warnings Fix the following compiler warnings: fs/fuse/cuse.c: In function 'cuse_process_init_reply': fs/fuse/cuse.c:288:24: warning: 'val' may be used uninitialized in this function [-Wmaybe-uninitialized] fs/fuse/cuse.c:272:14: note: 'val' was declared here fs/fuse/cuse.c:284:10: warning: 'key' may be used uninitialized in this function [-Wmaybe-uninitialized] fs/fuse/cuse.c:272:8: note: 'key' was declared here Signed-off-by: Miklos Szeredi --- fs/fuse/cuse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 2a2797e2abc..e397b675b02 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -266,7 +266,7 @@ static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp) static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo) { char *end = p + len; - char *key, *val; + char *uninitialized_var(key), *uninitialized_var(val); int rc; while (true) { -- cgit v1.2.3-70-g09d2 From 807185eb3e7271d7a1f2c08f3aa1b63dba547d07 Mon Sep 17 00:00:00 2001 From: "Robert P. J. Day" Date: Wed, 29 Aug 2012 17:51:51 -0400 Subject: fuse: Move CUSE Kconfig entry from fs/Kconfig into fs/fuse/Kconfig Given that CUSE depends on FUSE, it only makes sense to move its Kconfig entry into the FUSE Kconfig file. Also, add a few grammatical and semantic touchups. Signed-off-by: Robert P. J. Day Signed-off-by: Miklos Szeredi --- fs/Kconfig | 10 ---------- fs/fuse/Kconfig | 16 ++++++++++++++-- 2 files changed, 14 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/Kconfig b/fs/Kconfig index cfe512fd1ca..780725a463b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -68,16 +68,6 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" -config CUSE - tristate "Character device in Userspace support" - depends on FUSE_FS - help - This FUSE extension allows character devices to be - implemented in userspace. - - If you want to develop or use userspace character device - based on CUSE, answer Y or M. - config GENERIC_ACL bool select FS_POSIX_ACL diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index 0cf160a94ed..1b2f6c2c3aa 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -4,12 +4,24 @@ config FUSE_FS With FUSE it is possible to implement a fully functional filesystem in a userspace program. - There's also companion library: libfuse. This library along with - utilities is available from the FUSE homepage: + There's also a companion library: libfuse2. This library is available + from the FUSE homepage: + although chances are your distribution already has that library + installed if you've installed the "fuse" package itself. See for more information. See for needed library/utility version. If you want to develop a userspace FS, or if you want to use a filesystem based on FUSE, answer Y or M. + +config CUSE + tristate "Character device in Userspace support" + depends on FUSE_FS + help + This FUSE extension allows character devices to be + implemented in userspace. + + If you want to develop or use a userspace character device + based on CUSE, answer Y or M. -- cgit v1.2.3-70-g09d2 From cdadb11cef1802c1b0228976f08647d276711086 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Sat, 10 Nov 2012 16:55:56 +0100 Subject: fuse: make fuse_file_fallocate() static Fix the following sparse warning: fs/fuse/file.c:2249:6: warning: symbol 'fuse_file_fallocate' was not declared. Should it be static? Signed-off-by: Miklos Szeredi --- fs/fuse/file.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e21d4d8f87e..f3ab824fa30 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -2177,8 +2177,8 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, return ret; } -long fuse_file_fallocate(struct file *file, int mode, loff_t offset, - loff_t length) +static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, + loff_t length) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fc; @@ -2213,7 +2213,6 @@ long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return err; } -EXPORT_SYMBOL_GPL(fuse_file_fallocate); static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, -- cgit v1.2.3-70-g09d2 From 8f706111a860c026bcb0abe0c5936f59c31e5c87 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Thu, 18 Oct 2012 22:51:25 +0800 Subject: fuse: remove unused variable in fuse_try_move_page() The variables mapping,index are initialized but never used otherwise, so remove the unused variables. dpatch engine is used to auto generate this patch. (https://github.com/weiyj/dpatch) Signed-off-by: Wei Yongjun Signed-off-by: Miklos Szeredi --- fs/fuse/dev.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c16335315e5..e83351aa5ba 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -692,8 +692,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *oldpage = *pagep; struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - struct address_space *mapping; - pgoff_t index; unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); @@ -724,9 +722,6 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (fuse_check_page(newpage) != 0) goto out_fallback_unlock; - mapping = oldpage->mapping; - index = oldpage->index; - /* * This is a new and locked page, it shouldn't be mapped or * have any special flags on it -- cgit v1.2.3-70-g09d2 From ed0fb78fb6aa294a719f8f5654fdff0ec8bc00bc Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: bring back balance pause/resume logic Balance pause/resume logic got broken by 5ac00add (went in into 3.8-rc1 as part of dev-replace merge). Offending commit took a stab at making mutually exclusive volume operations (add_dev, rm_dev, resize, balance, replace_dev) not block behind volume_mutex if another such operation is in progress and instead return an error right away. Balancing front-end relied on the blocking behaviour, so the fix is ugly, but short of a complete rework, it's the best we can do. Reported-by: Liu Bo Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 78 ++++++++++++++++++++++++++++++++++++++++++++---------- fs/btrfs/volumes.c | 10 ++++--- 2 files changed, 71 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 982c0b9ceea..77d8273e394 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3440,8 +3440,8 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_ioctl_balance_args *bargs; struct btrfs_balance_control *bctl; + bool need_unlock; /* for mut. excl. ops lock */ int ret; - int need_to_clear_lock = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -3450,14 +3450,61 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&fs_info->volume_mutex); +again: + if (!atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)) { + mutex_lock(&fs_info->volume_mutex); + mutex_lock(&fs_info->balance_mutex); + need_unlock = true; + goto locked; + } + + /* + * mut. excl. ops lock is locked. Three possibilites: + * (1) some other op is running + * (2) balance is running + * (3) balance is paused -- special case (think resume) + */ mutex_lock(&fs_info->balance_mutex); + if (fs_info->balance_ctl) { + /* this is either (2) or (3) */ + if (!atomic_read(&fs_info->balance_running)) { + mutex_unlock(&fs_info->balance_mutex); + if (!mutex_trylock(&fs_info->volume_mutex)) + goto again; + mutex_lock(&fs_info->balance_mutex); + + if (fs_info->balance_ctl && + !atomic_read(&fs_info->balance_running)) { + /* this is (3) */ + need_unlock = false; + goto locked; + } + + mutex_unlock(&fs_info->balance_mutex); + mutex_unlock(&fs_info->volume_mutex); + goto again; + } else { + /* this is (2) */ + mutex_unlock(&fs_info->balance_mutex); + ret = -EINPROGRESS; + goto out; + } + } else { + /* this is (1) */ + mutex_unlock(&fs_info->balance_mutex); + pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + ret = -EINVAL; + goto out; + } + +locked: + BUG_ON(!atomic_read(&fs_info->mutually_exclusive_operation_running)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); if (IS_ERR(bargs)) { ret = PTR_ERR(bargs); - goto out; + goto out_unlock; } if (bargs->flags & BTRFS_BALANCE_RESUME) { @@ -3477,13 +3524,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) bargs = NULL; } - if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, - 1)) { - pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + if (fs_info->balance_ctl) { ret = -EINPROGRESS; goto out_bargs; } - need_to_clear_lock = 1; bctl = kzalloc(sizeof(*bctl), GFP_NOFS); if (!bctl) { @@ -3504,11 +3548,17 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } do_balance: - ret = btrfs_balance(bctl, bargs); /* - * bctl is freed in __cancel_balance or in free_fs_info if - * restriper was paused all the way until unmount + * Ownership of bctl and mutually_exclusive_operation_running + * goes to to btrfs_balance. bctl is freed in __cancel_balance, + * or, if restriper was paused all the way until unmount, in + * free_fs_info. mutually_exclusive_operation_running is + * cleared in __cancel_balance. */ + need_unlock = false; + + ret = btrfs_balance(bctl, bargs); + if (arg) { if (copy_to_user(arg, bargs, sizeof(*bargs))) ret = -EFAULT; @@ -3516,12 +3566,12 @@ do_balance: out_bargs: kfree(bargs); -out: - if (need_to_clear_lock) - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); +out_unlock: mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); + if (need_unlock) + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); +out: mnt_drop_write_file(file); return ret; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 86279c37de6..9c84dbe64f1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2959,6 +2959,8 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info) unset_balance_control(fs_info); ret = del_balance_item(fs_info->tree_root); BUG_ON(ret); + + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); } void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, @@ -3138,8 +3140,10 @@ int btrfs_balance(struct btrfs_balance_control *bctl, out: if (bctl->flags & BTRFS_BALANCE_RESUME) __cancel_balance(fs_info); - else + else { kfree(bctl); + atomic_set(&fs_info->mutually_exclusive_operation_running, 0); + } return ret; } @@ -3156,7 +3160,6 @@ static int balance_kthread(void *data) ret = btrfs_balance(fs_info->balance_ctl, NULL); } - atomic_set(&fs_info->mutually_exclusive_operation_running, 0); mutex_unlock(&fs_info->balance_mutex); mutex_unlock(&fs_info->volume_mutex); @@ -3179,7 +3182,6 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) return 0; } - WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance"); if (IS_ERR(tsk)) return PTR_ERR(tsk); @@ -3233,6 +3235,8 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) btrfs_balance_sys(leaf, item, &disk_bargs); btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs); + WARN_ON(atomic_xchg(&fs_info->mutually_exclusive_operation_running, 1)); + mutex_lock(&fs_info->volume_mutex); mutex_lock(&fs_info->balance_mutex); -- cgit v1.2.3-70-g09d2 From 2c0c9da02a2c4289350da6e54202a86602c0f926 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix "mutually exclusive op is running" error code The error code that is returned in response to starting a mutually exclusive operation when there is one already running got silently changed from EINVAL to EINPROGRESS by 5ac00add. Returning EINPROGRESS to, say, add_dev, when rm_dev is running is misleading. Furthermore, the operation itself may want to use EINPROGRESS for other purposes. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 77d8273e394..259dd52d878 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1340,7 +1340,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2192,7 +2192,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } ret = mnt_want_write_file(file); if (ret) { @@ -2266,7 +2266,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); @@ -2303,7 +2303,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); mnt_drop_write_file(file); - return -EINPROGRESS; + return -EINVAL; } mutex_lock(&root->fs_info->volume_mutex); -- cgit v1.2.3-70-g09d2 From 18f39c416d18d74ac11d157e44247253d3fa30ae Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_resize Fix unlock order in btrfs_ioctl_resize(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 259dd52d878..fcf1b1b4008 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1446,8 +1446,8 @@ out_free: kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3-70-g09d2 From 4ac20c70b0734b65662ded735e5f6ba0415bdb71 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: fix unlock order in btrfs_ioctl_rm_dev Fix unlock order in btrfs_ioctl_rm_dev(). Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fcf1b1b4008..f5c1c150d9f 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2319,8 +2319,8 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3-70-g09d2 From 25122d15e21cf252e91e4cad7cea760f97df29f1 Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Sun, 20 Jan 2013 15:57:57 +0200 Subject: Btrfs: reorder locks and sanity checks in btrfs_ioctl_defrag Operation-specific check (whether subvol is readonly or not) should go after the mutual exclusiveness check. Signed-off-by: Ilya Dryomov --- fs/btrfs/ioctl.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f5c1c150d9f..afbf3ac2079 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2186,19 +2186,20 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) struct btrfs_ioctl_defrag_range_args *range; int ret; - if (btrfs_root_readonly(root)) - return -EROFS; + ret = mnt_want_write_file(file); + if (ret) + return ret; if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running, 1)) { pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n"); + mnt_drop_write_file(file); return -EINVAL; } - ret = mnt_want_write_file(file); - if (ret) { - atomic_set(&root->fs_info->mutually_exclusive_operation_running, - 0); - return ret; + + if (btrfs_root_readonly(root)) { + ret = -EROFS; + goto out; } switch (inode->i_mode & S_IFMT) { @@ -2250,8 +2251,8 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) ret = -EINVAL; } out: - mnt_drop_write_file(file); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + mnt_drop_write_file(file); return ret; } -- cgit v1.2.3-70-g09d2 From e3e2775cedc9d6294b7bc7cbe9f59c62f9472871 Mon Sep 17 00:00:00 2001 From: Nickolai Zeldovich Date: Wed, 16 Jan 2013 21:36:17 -0500 Subject: cifs: fix srcip_matches() for ipv6 srcip_matches() previously had code like this: srcip_matches(..., struct sockaddr *rhs) { /* ... */ struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *) &rhs; return ipv6_addr_equal(..., &vaddr6->sin6_addr); } which interpreted the values on the stack after the 'rhs' pointer as an ipv6 address. The correct thing to do is to use 'rhs', not '&rhs'. Signed-off-by: Nickolai Zeldovich Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 17c3643e595..12b3da39733 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1917,7 +1917,7 @@ srcip_matches(struct sockaddr *srcaddr, struct sockaddr *rhs) } case AF_INET6: { struct sockaddr_in6 *saddr6 = (struct sockaddr_in6 *)srcaddr; - struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)&rhs; + struct sockaddr_in6 *vaddr6 = (struct sockaddr_in6 *)rhs; return ipv6_addr_equal(&saddr6->sin6_addr, &vaddr6->sin6_addr); } default: -- cgit v1.2.3-70-g09d2 From ff24858c65d9c518af41aad22fb964685351051a Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:08 -0700 Subject: Btrfs: ignore orphan qgroup relations If a qgroup that has still assignments is deleted by the user, the corresponding relations are left in the tree. This leads to an unmountable filesystem. With this patch, those relations are simple ignored. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index fe9d02c45f8..28f2b39f6a2 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -379,6 +379,13 @@ next1: ret = add_relation_rb(fs_info, found_key.objectid, found_key.offset); + if (ret == -ENOENT) { + printk(KERN_WARNING + "btrfs: orphan qgroup relation 0x%llx->0x%llx\n", + (unsigned long long)found_key.objectid, + (unsigned long long)found_key.offset); + ret = 0; /* ignore the error */ + } if (ret) goto out; next2: -- cgit v1.2.3-70-g09d2 From 2cf687039676c2b6e1ee96b0b89090aca94babcd Mon Sep 17 00:00:00 2001 From: Arne Jansen Date: Thu, 17 Jan 2013 01:22:09 -0700 Subject: Btrfs: prevent qgroup destroy when there are still relations Currently you can just destroy a qgroup even though it is in use by other qgroups or has qgroups assigned to it. This patch prevents destruction of qgroups unless they are completely unused. Otherwise destroy will return EBUSY. Reported-by: Eric Hopper Signed-off-by: Arne Jansen Signed-off-by: Chris Mason --- fs/btrfs/qgroup.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 28f2b39f6a2..a5c85623432 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -963,17 +963,28 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 qgroupid) { struct btrfs_root *quota_root; + struct btrfs_qgroup *qgroup; int ret = 0; quota_root = fs_info->quota_root; if (!quota_root) return -EINVAL; + /* check if there are no relations to this qgroup */ + spin_lock(&fs_info->qgroup_lock); + qgroup = find_qgroup_rb(fs_info, qgroupid); + if (qgroup) { + if (!list_empty(&qgroup->groups) || !list_empty(&qgroup->members)) { + spin_unlock(&fs_info->qgroup_lock); + return -EBUSY; + } + } + spin_unlock(&fs_info->qgroup_lock); + ret = del_qgroup_item(trans, quota_root, qgroupid); spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(quota_root->fs_info, qgroupid); - spin_unlock(&fs_info->qgroup_lock); return ret; -- cgit v1.2.3-70-g09d2 From a105bb88f46b60de2adf1ee98745bd59362b09ab Mon Sep 17 00:00:00 2001 From: Ilya Dryomov Date: Mon, 21 Jan 2013 15:15:56 +0200 Subject: Btrfs: fix a regression in balance usage filter Commit 3fed40cc ("Btrfs: cleanup duplicated division functions"), which was merged into 3.8-rc1, has introduced a regression by removing logic that was guarding us against bad user input. Bring it back. Signed-off-by: Ilya Dryomov Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c84dbe64f1..46960983891 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2614,7 +2614,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = btrfs_block_group_used(&cache->item); - user_thresh = div_factor_fine(cache->key.offset, bargs->usage); + if (bargs->usage == 0) + user_thresh = 0; + else if (bargs->usage > 100) + user_thresh = cache->key.offset; + else + user_thresh = div_factor_fine(cache->key.offset, + bargs->usage); + if (chunk_used < user_thresh) ret = 0; -- cgit v1.2.3-70-g09d2 From 6e6093a8f144414d904575da5fdea40cf14fb63e Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 17 Jan 2013 00:08:30 +0900 Subject: f2fs: add __init to functions in init_f2fs_fs Add __init to functions in init_f2fs_fs for code consistency. Signed-off-by: Namjae Jeon Signed-off-by: Amit Sahrawat Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/gc.c | 2 +- fs/f2fs/node.c | 2 +- fs/f2fs/super.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d75c86a1789..ff3c8439af8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -771,7 +771,7 @@ void init_orphan_info(struct f2fs_sb_info *sbi) sbi->n_orphans = 0; } -int create_checkpoint_caches(void) +int __init create_checkpoint_caches(void) { orphan_entry_slab = f2fs_kmem_cache_create("f2fs_orphan_entry", sizeof(struct orphan_inode_entry), NULL); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 976325d51e3..c8e2d751ef9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -914,7 +914,7 @@ int restore_node_summary(struct f2fs_sb_info *, unsigned int, void flush_nat_entries(struct f2fs_sb_info *); int build_node_manager(struct f2fs_sb_info *); void destroy_node_manager(struct f2fs_sb_info *); -int create_node_manager_caches(void); +int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* @@ -966,7 +966,7 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *); void block_operations(struct f2fs_sb_info *); void write_checkpoint(struct f2fs_sb_info *, bool, bool); void init_orphan_info(struct f2fs_sb_info *); -int create_checkpoint_caches(void); +int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* @@ -988,7 +988,7 @@ void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int); int f2fs_gc(struct f2fs_sb_info *); void build_gc_manager(struct f2fs_sb_info *); -int create_gc_caches(void); +int __init create_gc_caches(void); void destroy_gc_caches(void); /* @@ -1060,7 +1060,7 @@ struct f2fs_stat_info { int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void f2fs_create_root_stats(void); +void __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_call_count(si) @@ -1071,7 +1071,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void f2fs_create_root_stats(void) { } +static inline void __init f2fs_create_root_stats(void) { } static inline void f2fs_destroy_root_stats(void) { } #endif diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index b4dd90cf1f1..809cfec6683 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -697,7 +697,7 @@ void build_gc_manager(struct f2fs_sb_info *sbi) DIRTY_I(sbi)->v_ops = &default_v_ops; } -int create_gc_caches(void) +int __init create_gc_caches(void) { winode_slab = f2fs_kmem_cache_create("f2fs_gc_inodes", sizeof(struct inode_entry), NULL); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5066bfd256c..f177c018745 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1732,7 +1732,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) kfree(nm_i); } -int create_node_manager_caches(void) +int __init create_node_manager_caches(void) { nat_entry_slab = f2fs_kmem_cache_create("nat_entry", sizeof(struct nat_entry), NULL); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d551a724b73..37fad04c866 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -640,7 +640,7 @@ static struct file_system_type f2fs_fs_type = { .fs_flags = FS_REQUIRES_DEV, }; -static int init_inodecache(void) +static int __init init_inodecache(void) { f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", sizeof(struct f2fs_inode_info), NULL); -- cgit v1.2.3-70-g09d2 From 692bb55d1ab5b278181ff2e65f09eb0be6d50669 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 17 Jan 2013 18:37:41 +0900 Subject: f2fs: add remap_pages as generic_file_remap_pages This was added for all the file systems before. See the following commit. commit id: 0b173bc4daa8f8ec03a85abf5e47b23502ff80af [PATCH] mm: kill vma flag VM_CAN_NONLINEAR This patch moves actual ptes filling for non-linear file mappings into special vma operation: ->remap_pages(). File system must implement this method to get non-linear mappings support, if it uses filemap_fault() then generic_file_remap_pages() can be used. Now device drivers can implement this method and obtain nonlinear vma support." Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 819de7f39f2..3191b52aafb 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -96,8 +96,9 @@ out: } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, - .page_mkwrite = f2fs_vm_page_mkwrite, + .fault = filemap_fault, + .page_mkwrite = f2fs_vm_page_mkwrite, + .remap_pages = generic_file_remap_pages, }; static int need_to_sync_dir(struct f2fs_sb_info *sbi, struct inode *inode) -- cgit v1.2.3-70-g09d2 From c01e54b770e69c65525295eb2668be3dc0822406 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 17 Jan 2013 20:30:23 +0900 Subject: f2fs: support swapfile This patch adds f2fs_bmap operation to the data address space. This enables f2fs to support swapfile. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index b1347fc6d68..7bd22a20112 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -698,6 +698,11 @@ static int f2fs_set_data_page_dirty(struct page *page) return 0; } +static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) +{ + return generic_block_bmap(mapping, block, get_data_block_ro); +} + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -709,4 +714,5 @@ const struct address_space_operations f2fs_dblock_aops = { .invalidatepage = f2fs_invalidate_data_page, .releasepage = f2fs_release_data_page, .direct_IO = f2fs_direct_IO, + .bmap = f2fs_bmap, }; -- cgit v1.2.3-70-g09d2 From a7fdffbd3ea4b3cc2993af006bde38a423b38b72 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 18 Jan 2013 14:54:13 +0900 Subject: f2fs: avoid issuing small bios due to several dirty node pages If some small bios of dirty node pages are supposed to be issued during the sequential data writes, there-in well-produced consecutive data bios are able to be split by the small node bios, resulting in performance degradation. So, let's collect a number of dirty node pages until reaching a threshold. And, by default, I set the threshold as 2MB, a segment size. This improves sequential write performance on i5, 512GB SSD (830 w/ SATA2) as follows. Before: 231 MB/s -> After: 255 MB/s Signed-off-by: Jaegeuk Kim Reviewed-by: Namjae Jeon --- fs/f2fs/node.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f177c018745..9bda63c9c16 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1124,6 +1124,12 @@ static int f2fs_write_node_page(struct page *page, return 0; } +/* + * It is very important to gather dirty pages and write at once, so that we can + * submit a big bio without interfering other data writes. + * Be default, 512 pages (2MB), a segment size, is quite reasonable. + */ +#define COLLECT_DIRTY_NODES 512 static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1131,17 +1137,16 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct block_device *bdev = sbi->sb->s_bdev; long nr_to_write = wbc->nr_to_write; - if (wbc->for_kupdate) - return 0; - - if (get_pages(sbi, F2FS_DIRTY_NODES) == 0) - return 0; - + /* First check balancing cached NAT entries */ if (try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK)) { write_checkpoint(sbi, false, false); return 0; } + /* collect a number of dirty node pages and write together */ + if (get_pages(sbi, F2FS_DIRTY_NODES) < COLLECT_DIRTY_NODES) + return 0; + /* if mounting is failed, skip writing node pages */ wbc->nr_to_write = bio_get_nr_vecs(bdev); sync_node_pages(sbi, 0, wbc); -- cgit v1.2.3-70-g09d2 From 9af45ef5ab8ce4a13c553200dc15509441fbd68f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 21 Jan 2013 17:34:21 +0900 Subject: f2fs: add comments of start_bidx_of_node The caller of start_bidx_of_node() should give proper node offsets which point only direct node blocks. Otherwise, it is a caller's bug. This patch adds comments to make it clear. Reported-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 809cfec6683..c386910dacc 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -424,7 +424,11 @@ next_step: } /* - * Calculate start block index that this node page contains + * Calculate start block index indicating the given node offset. + * Be careful, caller should give this node offset only indicating direct node + * blocks. If any node offsets, which point the other types of node blocks such + * as indirect or double indirect node blocks, are given, it must be a caller's + * bug. */ block_t start_bidx_of_node(unsigned int node_ofs) { -- cgit v1.2.3-70-g09d2 From d8b79b2f94600262fcfbffbe3df7fd3c83c6c51b Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 20 Jan 2013 18:02:58 +0300 Subject: f2fs: use _safe() version of list_for_each This is calling list_del() inside a loop which is a problem when we try move to the next item on the list. I've converted it to use the _safe version. And also, as a cleanup, I've converted it to use list_for_each_entry instead of list_for_each. Signed-off-by: Dan Carpenter Reviewed-by: Dmitry Torokhov Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6cc046d3681..f42e4060b39 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -173,10 +173,9 @@ out: static void destroy_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - struct list_head *this; - struct fsync_inode_entry *entry; - list_for_each(this, head) { - entry = list_entry(this, struct fsync_inode_entry, list); + struct fsync_inode_entry *entry, *tmp; + + list_for_each_entry_safe(entry, tmp, head, list) { iput(entry->inode); list_del(&entry->list); kmem_cache_free(fsync_entry_slab, entry); -- cgit v1.2.3-70-g09d2 From 10b8c7dff5d3633b69e77f57d404dab54ead3787 Mon Sep 17 00:00:00 2001 From: Cong Ding Date: Tue, 22 Jan 2013 19:20:58 -0500 Subject: fs/cifs/cifs_dfs_ref.c: fix potential memory leakage When it goes to error through line 144, the memory allocated to *devname is not freed, and the caller doesn't free it either in line 250. So we free the memroy of *devname in function cifs_compose_mount_options() when it goes to error. Signed-off-by: Cong Ding CC: stable Reviewed-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index ce5cbd717bf..210fce2df30 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -226,6 +226,8 @@ compose_mount_options_out: compose_mount_options_err: kfree(mountdata); mountdata = ERR_PTR(rc); + kfree(*devname); + *devname = NULL; goto compose_mount_options_out; } -- cgit v1.2.3-70-g09d2 From 201a90389424d6771d24fc5d72f7e34cb4a8f967 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 24 Jan 2013 12:02:07 -0500 Subject: Btrfs: do not allow logged extents to be merged or removed We drop the extent map tree lock while we're logging extents, so somebody could come in and merge another extent into this one and screw up our logging, or they could even remove us from the list which would keep us from logging the extent or freeing our ref on it, so we need to make sure to not clear LOGGING until after the extent is logged, and then we can merge it to adjacent extents. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent_map.c | 13 ++++++++++++- fs/btrfs/extent_map.h | 1 + fs/btrfs/tree-log.c | 5 +++-- 3 files changed, 16 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index fff2c28497b..ed88f5ee4be 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -171,6 +171,10 @@ static int mergable_maps(struct extent_map *prev, struct extent_map *next) if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) return 0; + if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || + test_bit(EXTENT_FLAG_LOGGING, &next->flags)) + return 0; + if (extent_map_end(prev) == next->start && prev->flags == next->flags && prev->bdev == next->bdev && @@ -256,7 +260,8 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, if (!em) goto out; - list_move(&em->list, &tree->modified_extents); + if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + list_move(&em->list, &tree->modified_extents); em->generation = gen; clear_bit(EXTENT_FLAG_PINNED, &em->flags); em->mod_start = em->start; @@ -281,6 +286,12 @@ out: } +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) +{ + clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + try_merge_map(tree, em); +} + /** * add_extent_mapping - add new extent map to the extent tree * @tree: tree to insert new map in diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 922943ce29e..c6598c89cff 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -69,6 +69,7 @@ void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void extent_map_exit(void); int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 83186c7e45d..de8899b04d6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3410,13 +3410,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, em = list_entry(extents.next, struct extent_map, list); list_del_init(&em->list); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); /* * If we had an error we just need to delete everybody from our * private list. */ if (ret) { + clear_em_logging(tree, em); free_extent_map(em); continue; } @@ -3424,8 +3424,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, write_unlock(&tree->lock); ret = log_one_extent(trans, inode, root, em, path); - free_extent_map(em); write_lock(&tree->lock); + clear_em_logging(tree, em); + free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); -- cgit v1.2.3-70-g09d2 From b0175117b9376a69978bbe80af26fb95dddbd53e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 18 Dec 2012 11:39:19 -0500 Subject: Btrfs: fix panic when recovering tree log A user reported a BUG_ON(ret) that occured during tree log replay. Ret was -EAGAIN, so what I think happened is that we removed an extent that covered a bitmap entry and an extent entry. We remove the part from the bitmap and return -EAGAIN and then search for the next piece we want to remove, which happens to be an entire extent entry, so we just free the sucker and return. The problem is ret is still set to -EAGAIN so we trip the BUG_ON(). The user used btrfs-zero-log so I'm not 100% sure this is what happened so I've added a WARN_ON() to catch the other possibility. Thanks, Reported-by: Jan Steffens Signed-off-by: Josef Bacik --- fs/btrfs/free-space-cache.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 59ea2e4349c..0be7a8742a4 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1862,11 +1862,13 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, { struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; struct btrfs_free_space *info; - int ret = 0; + int ret; + bool re_search = false; spin_lock(&ctl->tree_lock); again: + ret = 0; if (!bytes) goto out_lock; @@ -1879,17 +1881,17 @@ again: info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset), 1, 0); if (!info) { - /* the tree logging code might be calling us before we - * have fully loaded the free space rbtree for this - * block group. So it is possible the entry won't - * be in the rbtree yet at all. The caching code - * will make sure not to put it in the rbtree if - * the logging code has pinned it. + /* + * If we found a partial bit of our free space in a + * bitmap but then couldn't find the other part this may + * be a problem, so WARN about it. */ + WARN_ON(re_search); goto out_lock; } } + re_search = false; if (!info->bitmap) { unlink_free_space(ctl, info); if (offset == info->offset) { @@ -1935,8 +1937,10 @@ again: } ret = remove_from_bitmap(ctl, info, &offset, &bytes); - if (ret == -EAGAIN) + if (ret == -EAGAIN) { + re_search = true; goto again; + } BUG_ON(ret); /* logic error */ out_lock: spin_unlock(&ctl->tree_lock); -- cgit v1.2.3-70-g09d2 From 192000dda22e02225772e862b92e7c09e5a17d08 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Sun, 6 Jan 2013 03:38:22 +0000 Subject: Btrfs: use right range to find checksum for compressed extents For compressed extents, the range of checksum is covered by disk length, and the disk length is different with ram length, so we need to use disk length instead to get us the right checksum. Signed-off-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/tree-log.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index de8899b04d6..9027bb1e746 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3357,6 +3357,11 @@ static int log_one_extent(struct btrfs_trans_handle *trans, if (skip_csum) return 0; + if (em->compress_type) { + csum_offset = 0; + csum_len = block_len; + } + /* block start is already adjusted for the file extent offset. */ ret = btrfs_lookup_csums_range(log->fs_info->csum_root, em->block_start + csum_offset, -- cgit v1.2.3-70-g09d2 From e58dd74bccb4317e39e4b675bf9c6cd133608fac Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 22 Jan 2013 15:43:09 -0500 Subject: Btrfs: put csums on the right ordered extent I noticed a WARN_ON going off when adding csums because we were going over the amount of csum bytes that should have been allowed for an ordered extent. This is a leftover from when we used to hold the csums privately for direct io, but now we use the normal ordered sum stuff so we need to make sure and check if we've moved on to another extent so that the csums are added to the right extent. Without this we could end up with csums for bytenrs that don't have extents to cover them yet. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/file-item.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index bd38cef4235..94aa53b3872 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -460,8 +460,8 @@ int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, if (!contig) offset = page_offset(bvec->bv_page) + bvec->bv_offset; - if (!contig && (offset >= ordered->file_offset + ordered->len || - offset < ordered->file_offset)) { + if (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset) { unsigned long bytes_left; sums->len = this_sum_bytes; this_sum_bytes = 0; -- cgit v1.2.3-70-g09d2 From 8d25a086eb104297e3ba1fdd180b04cfaaa84797 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:27:25 +0000 Subject: Btrfs: Add ACCESS_ONCE() to transaction->abort accesses We may access and update transaction->aborted on the different CPUs without lock, so we need ACCESS_ONCE() wrapper to prevent the compiler from creating unsolicited accesses and make sure we can get the right value. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/super.c | 2 +- fs/btrfs/transaction.c | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 99545df1b86..d8982e9601d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -267,7 +267,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, function, line, errstr); return; } - trans->transaction->aborted = errno; + ACCESS_ONCE(trans->transaction->aborted) = errno; __btrfs_std_error(root->fs_info, function, line, errno, NULL); } /* diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 87fac9a21ea..0ef29611fad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1468,7 +1468,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } - if (cur_trans->aborted) { + /* Stop the commit early if ->aborted is set */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { ret = cur_trans->aborted; goto cleanup_transaction; } -- cgit v1.2.3-70-g09d2 From 2cba30f172afdfa00f3e844f42f21eb3b972d01c Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 15 Jan 2013 06:29:12 +0000 Subject: Btrfs: fix missed transaction->aborted check First, though the current transaction->aborted check can stop the commit early and avoid unnecessary operations, it is too early, and some transaction handles don't end, those handles may set transaction->aborted after the check. Second, when we commit the transaction, we will wake up some worker threads to flush the space cache and inode cache. Those threads also allocate some transaction handles and may set transaction->aborted if some serious error happens. So we need more check for ->aborted when committing the transaction. Fix it. Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/transaction.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 0ef29611fad..f15494699f3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1575,6 +1575,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, wait_event(cur_trans->writer_wait, atomic_read(&cur_trans->num_writers) == 1); + /* ->aborted might be set after the previous check, so check it */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + goto cleanup_transaction; + } /* * the reloc mutex makes sure that we stop * the balancing code from coming in and moving @@ -1658,6 +1663,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto cleanup_transaction; } + /* + * The tasks which save the space cache and inode cache may also + * update ->aborted, check it. + */ + if (unlikely(ACCESS_ONCE(cur_trans->aborted))) { + ret = cur_trans->aborted; + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto cleanup_transaction; + } + btrfs_prepare_extent_commit(trans, root); cur_trans = root->fs_info->running_transaction; -- cgit v1.2.3-70-g09d2 From c9f01bfe0ca411b4751d7fdbb9d602035ba52f75 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 16 Jan 2013 11:27:17 +0000 Subject: Btrfs: fix wrong max device number for single profile The max device number of single profile is 1, not 0 (0 means 'as many as possible'). Fix it. Cc: Liu Bo Signed-off-by: Miao Xie Reviewed-by: Liu Bo Signed-off-by: Josef Bacik --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 46960983891..15f6efdf646 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3507,7 +3507,7 @@ struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { { 1, 1, 2, 2, 2, 2 /* raid1 */ }, { 1, 2, 1, 1, 1, 2 /* dup */ }, { 1, 1, 0, 2, 1, 1 /* raid0 */ }, - { 1, 1, 0, 1, 1, 1 /* single */ }, + { 1, 1, 1, 1, 1, 1 /* single */ }, }; static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, -- cgit v1.2.3-70-g09d2 From 1eafa6c73791e4f312324ddad9cbcaf6a1b6052b Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Tue, 22 Jan 2013 10:49:00 +0000 Subject: Btrfs: fix repeated delalloc work allocation btrfs_start_delalloc_inodes() locks the delalloc_inodes list, fetches the first inode, unlocks the list, triggers btrfs_alloc_delalloc_work/ btrfs_queue_worker for this inode, and then it locks the list, checks the head of the list again. But because we don't delete the first inode that it deals with before, it will fetch the same inode. As a result, this function allocates a huge amount of btrfs_delalloc_work structures, and OOM happens. Fix this problem by splice this delalloc list. Reported-by: Alex Lyakas Signed-off-by: Miao Xie Signed-off-by: Josef Bacik --- fs/btrfs/inode.c | 55 +++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 41 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9bc6c40b182..ca7ace7b7b5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7585,41 +7585,61 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) */ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { - struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; struct inode *inode; struct btrfs_delalloc_work *work, *next; struct list_head works; + struct list_head splice; int ret = 0; if (root->fs_info->sb->s_flags & MS_RDONLY) return -EROFS; INIT_LIST_HEAD(&works); - + INIT_LIST_HEAD(&splice); +again: spin_lock(&root->fs_info->delalloc_lock); - while (!list_empty(head)) { - binode = list_entry(head->next, struct btrfs_inode, + list_splice_init(&root->fs_info->delalloc_inodes, &splice); + while (!list_empty(&splice)) { + binode = list_entry(splice.next, struct btrfs_inode, delalloc_inodes); + + list_del_init(&binode->delalloc_inodes); + inode = igrab(&binode->vfs_inode); if (!inode) - list_del_init(&binode->delalloc_inodes); + continue; + + list_add_tail(&binode->delalloc_inodes, + &root->fs_info->delalloc_inodes); spin_unlock(&root->fs_info->delalloc_lock); - if (inode) { - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (!work) { - ret = -ENOMEM; - goto out; - } - list_add_tail(&work->list, &works); - btrfs_queue_worker(&root->fs_info->flush_workers, - &work->work); + + work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + if (unlikely(!work)) { + ret = -ENOMEM; + goto out; } + list_add_tail(&work->list, &works); + btrfs_queue_worker(&root->fs_info->flush_workers, + &work->work); + cond_resched(); spin_lock(&root->fs_info->delalloc_lock); } spin_unlock(&root->fs_info->delalloc_lock); + list_for_each_entry_safe(work, next, &works, list) { + list_del_init(&work->list); + btrfs_wait_and_free_delalloc_work(work); + } + + spin_lock(&root->fs_info->delalloc_lock); + if (!list_empty(&root->fs_info->delalloc_inodes)) { + spin_unlock(&root->fs_info->delalloc_lock); + goto again; + } + spin_unlock(&root->fs_info->delalloc_lock); + /* the filemap_flush will queue IO into the worker threads, but * we have to make sure the IO is actually started and that * ordered extents get created before we return @@ -7632,11 +7652,18 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) atomic_read(&root->fs_info->async_delalloc_pages) == 0)); } atomic_dec(&root->fs_info->async_submit_draining); + return 0; out: list_for_each_entry_safe(work, next, &works, list) { list_del_init(&work->list); btrfs_wait_and_free_delalloc_work(work); } + + if (!list_empty_careful(&splice)) { + spin_lock(&root->fs_info->delalloc_lock); + list_splice_tail(&splice, &root->fs_info->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + } return ret; } -- cgit v1.2.3-70-g09d2 From dee972b967ae111ad5705733de17a3bfc4632311 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 16 Jan 2013 15:05:44 -0500 Subject: NFS: Fix error reporting in nfs_xdev_mount Currently, nfs_xdev_mount converts all errors from clone_server() to ENOMEM, which can then leak to userspace (for instance to 'mount'). Fix that. Also ensure that if nfs_fs_mount_common() returns an error, we don't dprintk(0)... The regression originated in commit 3d176e3fe4f6dc379b252bf43e2e146a8f7caf01 (NFS: Use nfs_fs_mount_common() for xdev mounts) Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 3.5] --- fs/nfs/super.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 2e7e8c878e5..b056b162872 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2589,27 +2589,23 @@ nfs_xdev_mount(struct file_system_type *fs_type, int flags, struct nfs_server *server; struct dentry *mntroot = ERR_PTR(-ENOMEM); struct nfs_subversion *nfs_mod = NFS_SB(data->sb)->nfs_client->cl_nfs_mod; - int error; - dprintk("--> nfs_xdev_mount_common()\n"); + dprintk("--> nfs_xdev_mount()\n"); mount_info.mntfh = mount_info.cloned->fh; /* create a new volume representation */ server = nfs_mod->rpc_ops->clone_server(NFS_SB(data->sb), data->fh, data->fattr, data->authflavor); - if (IS_ERR(server)) { - error = PTR_ERR(server); - goto out_err; - } - mntroot = nfs_fs_mount_common(server, flags, dev_name, &mount_info, nfs_mod); - dprintk("<-- nfs_xdev_mount_common() = 0\n"); -out: - return mntroot; + if (IS_ERR(server)) + mntroot = ERR_CAST(server); + else + mntroot = nfs_fs_mount_common(server, flags, + dev_name, &mount_info, nfs_mod); -out_err: - dprintk("<-- nfs_xdev_mount_common() = %d [error]\n", error); - goto out; + dprintk("<-- nfs_xdev_mount() = %ld\n", + IS_ERR(mntroot) ? PTR_ERR(mntroot) : 0L); + return mntroot; } #if IS_ENABLED(CONFIG_NFS_V4) -- cgit v1.2.3-70-g09d2 From 4ae19c2dd713edb7b8ad3d4ab9d234ed5dcb6b98 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Jan 2013 22:41:53 -0500 Subject: NFSv4: Fix NFSv4 reference counting for trunked sessions The reference counting in nfs4_init_client assumes wongly that it is safe for nfs4_discover_server_trunking() to return a pointer to a nfs_client prior to bumping the reference count. Signed-off-by: Trond Myklebust Cc: Chuck Lever Cc: Ben Greear Cc: stable@vger.kernel.org [>=3.7] --- fs/nfs/nfs4client.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index acc34726812..65a290a7306 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -236,11 +236,10 @@ struct nfs_client *nfs4_init_client(struct nfs_client *clp, error = nfs4_discover_server_trunking(clp, &old); if (error < 0) goto error; + nfs_put_client(clp); if (clp != old) { clp->cl_preserve_clid = true; - nfs_put_client(clp); clp = old; - atomic_inc(&clp->cl_count); } return clp; @@ -306,7 +305,7 @@ int nfs40_walk_client_list(struct nfs_client *new, .clientid = new->cl_clientid, .confirm = new->cl_confirm, }; - int status; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -332,28 +331,28 @@ int nfs40_walk_client_list(struct nfs_client *new, if (prev) nfs_put_client(prev); + prev = pos; status = nfs4_proc_setclientid_confirm(pos, &clid, cred); - if (status == 0) { + switch (status) { + case -NFS4ERR_STALE_CLIENTID: + break; + case 0: nfs4_swap_callback_idents(pos, new); - nfs_put_client(pos); + prev = NULL; *result = pos; dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); - return 0; - } - if (status != -NFS4ERR_STALE_CLIENTID) { - nfs_put_client(pos); - dprintk("NFS: <-- %s status = %d, no result\n", - __func__, status); - return status; + default: + goto out; } spin_lock(&nn->nfs_client_lock); - prev = pos; } + spin_unlock(&nn->nfs_client_lock); +out: /* * No matching nfs_client found. This should be impossible, * because the new nfs_client has already been added to @@ -363,9 +362,8 @@ int nfs40_walk_client_list(struct nfs_client *new, */ if (prev) nfs_put_client(prev); - spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #ifdef CONFIG_NFS_V4_1 @@ -473,6 +471,7 @@ int nfs41_walk_client_list(struct nfs_client *new, if (!nfs4_match_serverowners(pos, new)) continue; + atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s using nfs_client = %p ({%d})\n", __func__, pos, atomic_read(&pos->cl_count)); -- cgit v1.2.3-70-g09d2 From 202c312dba7d95b96493b412c606163a0cd83984 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Jan 2013 22:56:23 -0500 Subject: NFSv4: Fix NFSv4 trunking discovery If walking the list in nfs4[01]_walk_client_list fails, then the most likely explanation is that the server dropped the clientid before we actually managed to confirm it. As long as our nfs_client is the very last one in the list to be tested, the caller can be assured that this is the case when the final return value is NFS4ERR_STALE_CLIENTID. Reported-by: Ben Greear Signed-off-by: Trond Myklebust Cc: Chuck Lever Cc: stable@vger.kernel.org [>=3.7] Tested-by: Ben Greear --- fs/nfs/nfs4client.c | 26 +++++++------------------- fs/nfs/nfs4state.c | 8 ++------ 2 files changed, 9 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 65a290a7306..2f21f17fb16 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -352,14 +352,8 @@ int nfs40_walk_client_list(struct nfs_client *new, } spin_unlock(&nn->nfs_client_lock); + /* No match found. The server lost our clientid */ out: - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ if (prev) nfs_put_client(prev); dprintk("NFS: <-- %s status = %d\n", __func__, status); @@ -430,7 +424,7 @@ int nfs41_walk_client_list(struct nfs_client *new, { struct nfs_net *nn = net_generic(new->cl_net, nfs_net_id); struct nfs_client *pos, *n, *prev = NULL; - int error; + int status = -NFS4ERR_STALE_CLIENTID; spin_lock(&nn->nfs_client_lock); list_for_each_entry_safe(pos, n, &nn->nfs_client_list, cl_share_link) { @@ -446,8 +440,8 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; - error = nfs_wait_client_init_complete(pos); - if (error < 0) { + status = nfs_wait_client_init_complete(pos); + if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; @@ -480,16 +474,10 @@ int nfs41_walk_client_list(struct nfs_client *new, return 0; } - /* - * No matching nfs_client found. This should be impossible, - * because the new nfs_client has already been added to - * nfs_client_list by nfs_get_client(). - * - * Don't BUG(), since the caller is holding a mutex. - */ + /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); - pr_err("NFS: %s Error: no matching nfs_client found\n", __func__); - return -NFS4ERR_STALE_CLIENTID; + dprintk("NFS: <-- %s status = %d\n", __func__, status); + return status; } #endif /* CONFIG_NFS_V4_1 */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 9448c579d41..f72561ca689 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -136,16 +136,11 @@ int nfs40_discover_server_trunking(struct nfs_client *clp, clp->cl_confirm = clid.confirm; status = nfs40_walk_client_list(clp, result, cred); - switch (status) { - case -NFS4ERR_STALE_CLIENTID: - set_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - case 0: + if (status == 0) { /* Sustain the lease, even if it's empty. If the clientid4 * goes stale it's of no use for trunking discovery. */ nfs4_schedule_state_renewal(*result); - break; } - out: return status; } @@ -1863,6 +1858,7 @@ again: case -ETIMEDOUT: case -EAGAIN: ssleep(1); + case -NFS4ERR_STALE_CLIENTID: dprintk("NFS: %s after status %d, retrying\n", __func__, status); goto again; -- cgit v1.2.3-70-g09d2 From 65436ec0c8e344d9b23302b686e418f2a7b7cf7b Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 18 Jan 2013 23:01:43 -0500 Subject: NFSv4.1: Ensure that nfs41_walk_client_list() does start lease recovery We do need to start the lease recovery thread prior to waiting for the client initialisation to complete in NFSv4.1. Signed-off-by: Trond Myklebust Cc: Chuck Lever Cc: Ben Greear Cc: stable@vger.kernel.org [>=3.7] --- fs/nfs/nfs4client.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 2f21f17fb16..2e9779b58b7 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -440,14 +440,17 @@ int nfs41_walk_client_list(struct nfs_client *new, nfs_put_client(prev); prev = pos; + nfs4_schedule_lease_recovery(pos); status = nfs_wait_client_init_complete(pos); if (status < 0) { nfs_put_client(pos); spin_lock(&nn->nfs_client_lock); continue; } - + status = pos->cl_cons_state; spin_lock(&nn->nfs_client_lock); + if (status < 0) + continue; } if (pos->rpc_ops != new->rpc_ops) -- cgit v1.2.3-70-g09d2 From d4e0bfec9b6fbb9b58640b44e01bb74ae0d29b22 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Thu, 3 Jan 2013 17:52:07 -0500 Subject: GFS2: fix skip unlock condition The recent commit fb6791d100d1bba20b5cdbc4912e1f7086ec60f8 included the wrong logic. The lvbptr check was incorrectly added after the patch was tested. Signed-off-by: David Teigland Signed-off-by: Steven Whitehouse --- fs/gfs2/lock_dlm.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index b906ed17a83..9802de0f85e 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -281,6 +281,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + int lvb_needs_unlock = 0; int error; if (gl->gl_lksb.sb_lkid == 0) { @@ -294,8 +295,12 @@ static void gdlm_put_lock(struct gfs2_glock *gl) gfs2_update_request_times(gl); /* don't want to skip dlm_unlock writing the lvb when lock is ex */ + + if (gl->gl_lksb.sb_lvbptr && (gl->gl_state == LM_ST_EXCLUSIVE)) + lvb_needs_unlock = 1; + if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && - gl->gl_lksb.sb_lvbptr && (gl->gl_state != LM_ST_EXCLUSIVE)) { + !lvb_needs_unlock) { gfs2_glock_free(gl); return; } -- cgit v1.2.3-70-g09d2 From 1bee12b8c44d825fb45cd6a13e76c185ed6888b8 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 16 Jan 2013 17:33:53 -0600 Subject: xfs: Do not return EFSCORRUPTED when filesystem probe finds no XFS magic 9802182 changed the return value from EWRONGFS (aka EINVAL) to EFSCORRUPTED which doesn't seem to be handled properly by the root filesystem probe. Signed-off-by: Eric Sandeen Tested-by: Sergei Trofimovich Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_mount.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index da508463ff1..7d6df7c00c3 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -658,7 +658,7 @@ xfs_sb_quiet_read_verify( return; } /* quietly fail */ - xfs_buf_ioerror(bp, EFSCORRUPTED); + xfs_buf_ioerror(bp, EWRONGFS); } static void -- cgit v1.2.3-70-g09d2 From d26978dd866dbb3b3a9690f3655a5e735055de89 Mon Sep 17 00:00:00 2001 From: Brian Foster Date: Thu, 17 Jan 2013 13:11:29 -0500 Subject: xfs: pull up stack_switch check into xfs_bmapi_write The stack_switch check currently occurs in __xfs_bmapi_allocate, which means the stack switch only occurs when xfs_bmapi_allocate() is called in a loop. Pull the check up before the loop in xfs_bmapi_write() such that the first iteration of the loop has consistent behavior. Signed-off-by: Brian Foster Reviewed-by: Dave Chinner Signed-off-by: Ben Myers --- fs/xfs/xfs_bmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 0e92d12765d..cdb2d334858 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4680,9 +4680,6 @@ __xfs_bmapi_allocate( return error; } - if (bma->flags & XFS_BMAPI_STACK_SWITCH) - bma->stack_switch = 1; - error = xfs_bmap_alloc(bma); if (error) return error; @@ -4956,6 +4953,9 @@ xfs_bmapi_write( bma.flist = flist; bma.firstblock = firstblock; + if (flags & XFS_BMAPI_STACK_SWITCH) + bma.stack_switch = 1; + while (bno < end && n < *nmap) { inhole = eof || bma.got.br_startoff > bno; wasdelay = !inhole && isnullstartblock(bma.got.br_startblock); -- cgit v1.2.3-70-g09d2 From eb178619f930fa2ba2348de332a1ff1c66a31424 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:52 +1100 Subject: xfs: fix _xfs_buf_find oops on blocks beyond the filesystem end When _xfs_buf_find is passed an out of range address, it will fail to find a relevant struct xfs_perag and oops with a null dereference. This can happen when trying to walk a filesystem with a metadata inode that has a partially corrupted extent map (i.e. the block number returned is corrupt, but is otherwise intact) and we try to read from the corrupted block address. In this case, just fail the lookup. If it is readahead being issued, it will simply not be done, but if it is real read that fails we will get an error being reported. Ideally this case should result in an EFSCORRUPTED error being reported, but we cannot return an error through xfs_buf_read() or xfs_buf_get() so this lookup failure may result in ENOMEM or EIO errors being reported instead. Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 56d1614760c..689d72655ea 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -487,6 +487,7 @@ _xfs_buf_find( struct rb_node *parent; xfs_buf_t *bp; xfs_daddr_t blkno = map[0].bm_bn; + xfs_daddr_t eofs; int numblks = 0; int i; @@ -498,6 +499,23 @@ _xfs_buf_find( ASSERT(!(numbytes < (1 << btp->bt_sshift))); ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_smask)); + /* + * Corrupted block numbers can get through to here, unfortunately, so we + * have to check that the buffer falls within the filesystem bounds. + */ + eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); + if (blkno >= eofs) { + /* + * XXX (dgc): we should really be returning EFSCORRUPTED here, + * but none of the higher level infrastructure supports + * returning a specific error on buffer lookup failures. + */ + xfs_alert(btp->bt_mount, + "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", + __func__, blkno, eofs); + return NULL; + } + /* get tree root */ pag = xfs_perag_get(btp->bt_mount, xfs_daddr_to_agno(btp->bt_mount, blkno)); -- cgit v1.2.3-70-g09d2 From f2a459565b02b60408f3f2e5ca992a031319712b Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:54 +1100 Subject: xfs: limit speculative prealloc near ENOSPC thresholds There is a window on small filesytsems where specualtive preallocation can be larger than that ENOSPC throttling thresholds, resulting in specualtive preallocation trying to reserve more space than there is space available. This causes immediate ENOSPC to be triggered, prealloc to be turned off and flushing to occur. One the next write (i.e. next 4k page), we do exactly the same thing, and so effective drive into synchronous 4k writes by triggering ENOSPC flushing on every page while in the window between the prealloc size and the ENOSPC prealloc throttle threshold. Fix this by checking to see if the prealloc size would consume all free space, and throttle it appropriately to avoid premature ENOSPC... Signed-off-by: Dave Chinner Reviewed-by: Brian Foster Signed-off-by: Ben Myers --- fs/xfs/xfs_iomap.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index add06b4e9a6..364818eef40 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -351,6 +351,15 @@ xfs_iomap_prealloc_size( } if (shift) alloc_blocks >>= shift; + + /* + * If we are still trying to allocate more space than is + * available, squash the prealloc hard. This can happen if we + * have a large file on a small filesystem and the above + * lowspace thresholds are smaller than MAXEXTLEN. + */ + while (alloc_blocks >= freesp) + alloc_blocks >>= 4; } if (alloc_blocks < mp->m_writeio_blocks) -- cgit v1.2.3-70-g09d2 From 9f87832a82923943aaab38b8d53658af134bbfa4 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Mon, 21 Jan 2013 23:53:55 +1100 Subject: xfs: fix shutdown hang on invalid inode during create When the new inode verify in xfs_iread() fails, the create transaction is aborted and a shutdown occurs. The subsequent unmount then hangs in xfs_wait_buftarg() on a buffer that has an elevated hold count. Debug showed that it was an AGI buffer getting stuck: [ 22.576147] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 22.976213] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.376206] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck [ 23.776325] XFS (vdb): buffer 0x2/0x1, hold 0x2 stuck The trace of this buffer leading up to the shutdown (trimmed for brevity) looks like: xfs_buf_init: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 1 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 1 caller xfs_trans_read_buf_map xfs_buf_iorequest: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_iorequest xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_iorequest xfs_buf_iowait: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_ioerror: bno 0x2 len 0x200 hold 1 caller xfs_buf_bio_end_io xfs_buf_iodone: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_ioend xfs_buf_iowait_done: bno 0x2 nblks 0x1 hold 1 caller _xfs_buf_read xfs_buf_hold: bno 0x2 nblks 0x1 hold 1 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_trans_brelse: bno 0x2 len 0x200 hold 2 recur 0 refcount 1 xfs_buf_item_relse: bno 0x2 nblks 0x1 hold 2 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_relse xfs_buf_unlock: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_rele: bno 0x2 nblks 0x1 hold 1 caller xfs_trans_brelse xfs_buf_trylock: bno 0x2 nblks 0x1 hold 2 caller _xfs_buf_find xfs_buf_find: bno 0x2 len 0x200 hold 2 caller xfs_buf_get_map xfs_buf_get: bno 0x2 len 0x200 hold 2 caller xfs_buf_read_map xfs_buf_read: bno 0x2 len 0x200 hold 2 caller xfs_trans_read_buf_map xfs_buf_hold: bno 0x2 nblks 0x1 hold 2 caller xfs_buf_item_init xfs_trans_read_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_trans_log_buf: bno 0x2 len 0x200 hold 3 recur 0 refcount 1 xfs_buf_item_unlock: bno 0x2 len 0x200 hold 3 flags DIRTY liflags ABORTED xfs_buf_unlock: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock xfs_buf_rele: bno 0x2 nblks 0x1 hold 3 caller xfs_buf_item_unlock And that is the AGI buffer from cold cache read into memory to transaction abort. You can see at transaction abort the bli is dirty and only has a single reference. The item is not pinned, and it's not in the AIL. Hence the only reference to it is this transaction. The problem is that the xfs_buf_item_unlock() call is dropping the last reference to the xfs_buf_log_item attached to the buffer (which holds a reference to the buffer), but it is not freeing the xfs_buf_log_item. Hence nothing will ever release the buffer, and the unmount hangs waiting for this reference to go away. The fix is simple - xfs_buf_item_unlock needs to detect the last reference going away in this case and free the xfs_buf_log_item to release the reference it holds on the buffer. Signed-off-by: Dave Chinner Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_buf.c | 2 ++ fs/xfs/xfs_buf_item.c | 12 ++++++++++-- fs/xfs/xfs_trace.h | 1 + 3 files changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 689d72655ea..fbbb9eb92e3 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1505,6 +1505,8 @@ restart: while (!list_empty(&btp->bt_lru)) { bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru); if (atomic_read(&bp->b_hold) > 1) { + trace_xfs_buf_wait_buftarg(bp, _RET_IP_); + list_move_tail(&bp->b_lru, &btp->bt_lru); spin_unlock(&btp->bt_lru_lock); delay(100); goto restart; diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 77b09750e92..3f9949fee39 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -652,7 +652,10 @@ xfs_buf_item_unlock( /* * If the buf item isn't tracking any data, free it, otherwise drop the - * reference we hold to it. + * reference we hold to it. If we are aborting the transaction, this may + * be the only reference to the buf item, so we free it anyway + * regardless of whether it is dirty or not. A dirty abort implies a + * shutdown, anyway. */ clean = 1; for (i = 0; i < bip->bli_format_count; i++) { @@ -664,7 +667,12 @@ xfs_buf_item_unlock( } if (clean) xfs_buf_item_relse(bp); - else + else if (aborted) { + if (atomic_dec_and_test(&bip->bli_refcount)) { + ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp)); + xfs_buf_item_relse(bp); + } + } else atomic_dec(&bip->bli_refcount); if (!hold) diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 2e137d4a85a..16a812977ea 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -341,6 +341,7 @@ DEFINE_BUF_EVENT(xfs_buf_item_relse); DEFINE_BUF_EVENT(xfs_buf_item_iodone); DEFINE_BUF_EVENT(xfs_buf_item_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); +DEFINE_BUF_EVENT(xfs_buf_wait_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_io); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); -- cgit v1.2.3-70-g09d2 From 4b05d09c18d9aa62d2e7fb4b057f54e5a38963f5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 23 Jan 2013 13:56:18 +0100 Subject: xfs: Fix possible use-after-free with AIO Running AIO is pinning inode in memory using file reference. Once AIO is completed using aio_complete(), file reference is put and inode can be freed from memory. So we have to be sure that calling aio_complete() is the last thing we do with the inode. CC: xfs@oss.sgi.com CC: Ben Myers CC: stable@vger.kernel.org Signed-off-by: Jan Kara Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4111a40ebe1..5f707e53717 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -86,11 +86,11 @@ xfs_destroy_ioend( } if (ioend->io_iocb) { + inode_dio_done(ioend->io_inode); if (ioend->io_isasync) { aio_complete(ioend->io_iocb, ioend->io_error ? ioend->io_error : ioend->io_result, 0); } - inode_dio_done(ioend->io_inode); } mempool_free(ioend, xfs_ioend_pool); -- cgit v1.2.3-70-g09d2 From 65e3aa77f1b0269720660a6879f6f28d158f54c8 Mon Sep 17 00:00:00 2001 From: Torsten Kaiser Date: Sun, 20 Jan 2013 10:24:49 +0100 Subject: xfs: Fix xfs_swap_extents() after removal of xfs_flushinval_pages() Commit fb59581404ab7ec5075299065c22cb211a9262a9 removed xfs_flushinval_pages() and changed its callers to use filemap_write_and_wait() and truncate_pagecache_range() directly. But in xfs_swap_extents() this change accidental switched the argument for 'tip' to 'ip'. This patch switches it back to 'tip' Signed-off-by: Torsten Kaiser Reviewed-by: Ben Myers Signed-off-by: Ben Myers --- fs/xfs/xfs_dfrag.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d0e9c74d3d9..a8bd26b82ec 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -246,10 +246,10 @@ xfs_swap_extents( goto out_unlock; } - error = -filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = -filemap_write_and_wait(VFS_I(tip)->i_mapping); if (error) goto out_unlock; - truncate_pagecache_range(VFS_I(ip), 0, -1); + truncate_pagecache_range(VFS_I(tip), 0, -1); /* Verify O_DIRECT for ftmp */ if (VN_CACHED(VFS_I(tip)) != 0) { -- cgit v1.2.3-70-g09d2 From ab225417825963b6dc66be7ea80f94ac1378dfdf Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 22 Jan 2013 00:17:06 -0500 Subject: NFS: Don't silently fail setattr() requests on mountpoints Ensure that any setattr and getattr requests for junctions and/or mountpoints are sent to the server. Ever since commit 0ec26fd0698 (vfs: automount should ignore LOOKUP_FOLLOW), we have silently dropped any setattr requests to a server-side mountpoint. For referrals, we have silently dropped both getattr and setattr requests. This patch restores the original behaviour for setattr on mountpoints, and tries to do the same for referrals, provided that we have a filehandle... Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/namespace.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) (limited to 'fs') diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index dd057bc6b65..fc8dc20fdeb 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -177,11 +177,31 @@ out_nofree: return mnt; } +static int +nfs_namespace_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_getattr(mnt, dentry, stat); + generic_fillattr(dentry->d_inode, stat); + return 0; +} + +static int +nfs_namespace_setattr(struct dentry *dentry, struct iattr *attr) +{ + if (NFS_FH(dentry->d_inode)->size != 0) + return nfs_setattr(dentry, attr); + return -EACCES; +} + const struct inode_operations nfs_mountpoint_inode_operations = { .getattr = nfs_getattr, + .setattr = nfs_setattr, }; const struct inode_operations nfs_referral_inode_operations = { + .getattr = nfs_namespace_getattr, + .setattr = nfs_namespace_setattr, }; static void nfs_expire_automounts(struct work_struct *work) -- cgit v1.2.3-70-g09d2 From c489ee290bdbbace6bb63ebe6ebd4dd605819495 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 30 Jan 2013 13:04:10 -0500 Subject: NFSv4.1: Handle NFS4ERR_DELAY when resetting the NFSv4.1 session NFS4ERR_DELAY is a legal reply when we call DESTROY_SESSION. It usually means that the server is busy handling an unfinished RPC request. Just sleep for a second and then retry. We also need to be able to handle the NFS4ERR_BACK_CHAN_BUSY return value. If the NFS server has outstanding callbacks, we just want to similarly sleep & retry. Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org --- fs/nfs/nfs4state.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index f72561ca689..e61f68d5ef2 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2018,8 +2018,18 @@ static int nfs4_reset_session(struct nfs_client *clp) nfs4_begin_drain_session(clp); cred = nfs4_get_exchange_id_cred(clp); status = nfs4_proc_destroy_session(clp->cl_session, cred); - if (status && status != -NFS4ERR_BADSESSION && - status != -NFS4ERR_DEADSESSION) { + switch (status) { + case 0: + case -NFS4ERR_BADSESSION: + case -NFS4ERR_DEADSESSION: + break; + case -NFS4ERR_BACK_CHAN_BUSY: + case -NFS4ERR_DELAY: + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + status = 0; + ssleep(1); + goto out; + default: status = nfs4_recovery_handle_error(clp, status); goto out; } -- cgit v1.2.3-70-g09d2