diff options
Diffstat (limited to 'fs')
178 files changed, 3959 insertions, 2991 deletions
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 18f74ec4dce..9d03d1ebca6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1001,44 +1001,6 @@ done: } /** - * v9fs_vfs_readlink - read a symlink's location - * @dentry: dentry for symlink - * @buffer: buffer to load symlink location into - * @buflen: length of buffer - * - */ - -static int v9fs_vfs_readlink(struct dentry *dentry, char __user * buffer, - int buflen) -{ - int retval; - int ret; - char *link = __getname(); - - if (unlikely(!link)) - return -ENOMEM; - - if (buflen > PATH_MAX) - buflen = PATH_MAX; - - P9_DPRINTK(P9_DEBUG_VFS, " dentry: %s (%p)\n", dentry->d_name.name, - dentry); - - retval = v9fs_readlink(dentry, link, buflen); - - if (retval > 0) { - if ((ret = copy_to_user(buffer, link, retval)) != 0) { - P9_DPRINTK(P9_DEBUG_ERROR, - "problem copying to user: %d\n", ret); - retval = ret; - } - } - - __putname(link); - return retval; -} - -/** * v9fs_vfs_follow_link - follow a symlink path * @dentry: dentry for symlink * @nd: nameidata @@ -1230,7 +1192,6 @@ static const struct inode_operations v9fs_dir_inode_operations_ext = { .rmdir = v9fs_vfs_rmdir, .mknod = v9fs_vfs_mknod, .rename = v9fs_vfs_rename, - .readlink = v9fs_vfs_readlink, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; @@ -1253,7 +1214,7 @@ static const struct inode_operations v9fs_file_inode_operations = { }; static const struct inode_operations v9fs_symlink_inode_operations = { - .readlink = v9fs_vfs_readlink, + .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, .put_link = v9fs_vfs_put_link, .getattr = v9fs_vfs_getattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 14a86448572..69357c0d989 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -188,7 +188,8 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, " %p\n", s); - v9fs_dentry_release(s->s_root); /* clunk root */ + if (s->s_root) + v9fs_dentry_release(s->s_root); /* clunk root */ kill_anon_super(s); diff --git a/fs/Kconfig b/fs/Kconfig index f8fccaaad62..64d44efad7a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -6,10 +6,6 @@ menu "File systems" if BLOCK -config FS_JOURNAL_INFO - bool - default n - source "fs/ext2/Kconfig" source "fs/ext3/Kconfig" source "fs/ext4/Kconfig" diff --git a/fs/affs/affs.h b/fs/affs/affs.h index e511dc621a2..0e40caaba45 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -106,8 +106,8 @@ struct affs_sb_info { u32 s_last_bmap; struct buffer_head *s_bmap_bh; char *s_prefix; /* Prefix for volumes and assigns. */ - int s_prefix_len; /* Length of prefix. */ char s_volume[32]; /* Volume prefix for absolute symlinks. */ + spinlock_t symlink_lock; /* protects the previous two */ }; #define SF_INTL 0x0001 /* International filesystem. */ diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 960d336ec69..d70bbbac6b7 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -341,10 +341,13 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) p = (char *)AFFS_HEAD(bh)->table; lc = '/'; if (*symname == '/') { + struct affs_sb_info *sbi = AFFS_SB(sb); while (*symname == '/') symname++; - while (AFFS_SB(sb)->s_volume[i]) /* Cannot overflow */ - *p++ = AFFS_SB(sb)->s_volume[i++]; + spin_lock(&sbi->symlink_lock); + while (sbi->s_volume[i]) /* Cannot overflow */ + *p++ = sbi->s_volume[i++]; + spin_unlock(&sbi->symlink_lock); } while (i < maxlen && (c = *symname++)) { if (c == '.' && lc == '/' && *symname == '.' && symname[1] == '/') { diff --git a/fs/affs/super.c b/fs/affs/super.c index 104fdcb3a7f..d41e9673cd9 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -203,7 +203,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s switch (token) { case Opt_bs: if (match_int(&args[0], &n)) - return -EINVAL; + return 0; if (n != 512 && n != 1024 && n != 2048 && n != 4096) { printk ("AFFS: Invalid blocksize (512, 1024, 2048, 4096 allowed)\n"); @@ -213,7 +213,7 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_mode: if (match_octal(&args[0], &option)) - return 1; + return 0; *mode = option & 0777; *mount_opts |= SF_SETMODE; break; @@ -221,8 +221,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s *mount_opts |= SF_MUFS; break; case Opt_prefix: - /* Free any previous prefix */ - kfree(*prefix); *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -233,21 +231,21 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_reserved: if (match_int(&args[0], reserved)) - return 1; + return 0; break; case Opt_root: if (match_int(&args[0], root)) - return 1; + return 0; break; case Opt_setgid: if (match_int(&args[0], &option)) - return 1; + return 0; *gid = option; *mount_opts |= SF_SETGID; break; case Opt_setuid: if (match_int(&args[0], &option)) - return -EINVAL; + return 0; *uid = option; *mount_opts |= SF_SETUID; break; @@ -311,11 +309,14 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) return -ENOMEM; sb->s_fs_info = sbi; mutex_init(&sbi->s_bmlock); + spin_lock_init(&sbi->symlink_lock); if (!parse_options(data,&uid,&gid,&i,&reserved,&root_block, &blocksize,&sbi->s_prefix, sbi->s_volume, &mount_flags)) { printk(KERN_ERR "AFFS: Error parsing options\n"); + kfree(sbi->s_prefix); + kfree(sbi); return -EINVAL; } /* N.B. after this point s_prefix must be released */ @@ -516,14 +517,18 @@ affs_remount(struct super_block *sb, int *flags, char *data) unsigned long mount_flags; int res = 0; char *new_opts = kstrdup(data, GFP_KERNEL); + char volume[32]; + char *prefix = NULL; pr_debug("AFFS: remount(flags=0x%x,opts=\"%s\")\n",*flags,data); *flags |= MS_NODIRATIME; + memcpy(volume, sbi->s_volume, 32); if (!parse_options(data, &uid, &gid, &mode, &reserved, &root_block, - &blocksize, &sbi->s_prefix, sbi->s_volume, + &blocksize, &prefix, volume, &mount_flags)) { + kfree(prefix); kfree(new_opts); return -EINVAL; } @@ -534,6 +539,14 @@ affs_remount(struct super_block *sb, int *flags, char *data) sbi->s_mode = mode; sbi->s_uid = uid; sbi->s_gid = gid; + /* protect against readers */ + spin_lock(&sbi->symlink_lock); + if (prefix) { + kfree(sbi->s_prefix); + sbi->s_prefix = prefix; + } + memcpy(sbi->s_volume, volume, 32); + spin_unlock(&sbi->symlink_lock); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { unlock_kernel(); diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index 41782539c90..ee00f08c4f5 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -20,7 +20,6 @@ static int affs_symlink_readpage(struct file *file, struct page *page) int i, j; char c; char lc; - char *pf; pr_debug("AFFS: follow_link(ino=%lu)\n",inode->i_ino); @@ -32,11 +31,15 @@ static int affs_symlink_readpage(struct file *file, struct page *page) j = 0; lf = (struct slink_front *)bh->b_data; lc = 0; - pf = AFFS_SB(inode->i_sb)->s_prefix ? AFFS_SB(inode->i_sb)->s_prefix : "/"; if (strchr(lf->symname,':')) { /* Handle assign or volume name */ + struct affs_sb_info *sbi = AFFS_SB(inode->i_sb); + char *pf; + spin_lock(&sbi->symlink_lock); + pf = sbi->s_prefix ? sbi->s_prefix : "/"; while (i < 1023 && (c = pf[i])) link[i++] = c; + spin_unlock(&sbi->symlink_lock); while (i < 1023 && lf->symname[j] != ':') link[i++] = lf->symname[j++]; if (i < 1023) diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 2c994591f4d..9f0bf13291e 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -121,13 +121,13 @@ struct file *anon_inode_getfile(const char *name, d_instantiate(path.dentry, anon_inode_inode); error = -ENFILE; - file = alloc_file(&path, FMODE_READ | FMODE_WRITE, fops); + file = alloc_file(&path, OPEN_FMODE(flags), fops); if (!file) goto err_dput; file->f_mapping = anon_inode_inode->i_mapping; file->f_pos = 0; - file->f_flags = O_RDWR | (flags & O_NONBLOCK); + file->f_flags = flags & (O_ACCMODE | O_NONBLOCK); file->f_version = 0; file->private_data = priv; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 33baf27fac7..34ddda888e6 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -873,6 +873,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); unacquire_priv_sbp: + kfree(befs_sb->mount_opts.iocharset); kfree(sb->s_fs_info); unacquire_none: diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 6f60336c662..8f3d9fd8960 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -353,35 +353,35 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct inode *inode; unsigned i, imap_len; struct bfs_sb_info *info; - long ret = -EINVAL; + int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) return -ENOMEM; + mutex_init(&info->bfs_lock); s->s_fs_info = info; sb_set_blocksize(s, BFS_BSIZE); - bh = sb_bread(s, 0); - if(!bh) + info->si_sbh = sb_bread(s, 0); + if (!info->si_sbh) goto out; - bfs_sb = (struct bfs_super_block *)bh->b_data; + bfs_sb = (struct bfs_super_block *)info->si_sbh->b_data; if (le32_to_cpu(bfs_sb->s_magic) != BFS_MAGIC) { if (!silent) printf("No BFS filesystem on %s (magic=%08x)\n", s->s_id, le32_to_cpu(bfs_sb->s_magic)); - goto out; + goto out1; } if (BFS_UNCLEAN(bfs_sb, s) && !silent) printf("%s is unclean, continuing\n", s->s_id); s->s_magic = BFS_MAGIC; - info->si_sbh = bh; if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { printf("Superblock is corrupted\n"); - goto out; + goto out1; } info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / @@ -390,7 +390,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) imap_len = (info->si_lasti / 8) + 1; info->si_imap = kzalloc(imap_len, GFP_KERNEL); if (!info->si_imap) - goto out; + goto out1; for (i = 0; i < BFS_ROOT_INO; i++) set_bit(i, info->si_imap); @@ -398,15 +398,13 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) inode = bfs_iget(s, BFS_ROOT_INO); if (IS_ERR(inode)) { ret = PTR_ERR(inode); - kfree(info->si_imap); - goto out; + goto out2; } s->s_root = d_alloc_root(inode); if (!s->s_root) { iput(inode); ret = -ENOMEM; - kfree(info->si_imap); - goto out; + goto out2; } info->si_blocks = (le32_to_cpu(bfs_sb->s_end) + 1) >> BFS_BSIZE_BITS; @@ -419,10 +417,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) bh = sb_bread(s, info->si_blocks - 1); if (!bh) { printf("Last block not available: %lu\n", info->si_blocks - 1); - iput(inode); ret = -EIO; - kfree(info->si_imap); - goto out; + goto out3; } brelse(bh); @@ -459,11 +455,8 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) printf("Inode 0x%08x corrupted\n", i); brelse(bh); - s->s_root = NULL; - kfree(info->si_imap); - kfree(info); - s->s_fs_info = NULL; - return -EIO; + ret = -EIO; + goto out3; } if (!di->i_ino) { @@ -483,11 +476,17 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_dirt = 1; } dump_imap("read_super", s); - mutex_init(&info->bfs_lock); return 0; +out3: + dput(s->s_root); + s->s_root = NULL; +out2: + kfree(info->si_imap); +out1: + brelse(info->si_sbh); out: - brelse(bh); + mutex_destroy(&info->bfs_lock); kfree(info); s->s_fs_info = NULL; return ret; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index b639dcf7c77..fdd39709917 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -32,7 +32,7 @@ static int load_aout_binary(struct linux_binprm *, struct pt_regs * regs); static int load_aout_library(struct file*); -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int aout_core_dump(struct coredump_params *cprm); static struct linux_binfmt aout_format = { .module = THIS_MODULE, @@ -89,8 +89,9 @@ if (file->f_op->llseek) { \ * dumping of the process results in another error.. */ -static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int aout_core_dump(struct coredump_params *cprm) { + struct file *file = cprm->file; mm_segment_t fs; int has_dumped = 0; unsigned long dump_start, dump_size; @@ -108,16 +109,16 @@ static int aout_core_dump(long signr, struct pt_regs *regs, struct file *file, u current->flags |= PF_DUMPCORE; strncpy(dump.u_comm, current->comm, sizeof(dump.u_comm)); dump.u_ar0 = offsetof(struct user, regs); - dump.signal = signr; - aout_dump_thread(regs, &dump); + dump.signal = cprm->signr; + aout_dump_thread(cprm->regs, &dump); /* If the size of the dump file exceeds the rlimit, then see what would happen if we wrote the stack, but not the data area. */ - if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > limit) + if ((dump.u_dsize + dump.u_ssize+1) * PAGE_SIZE > cprm->limit) dump.u_dsize = 0; /* Make sure we have enough room to write the stack and data areas. */ - if ((dump.u_ssize + 1) * PAGE_SIZE > limit) + if ((dump.u_ssize + 1) * PAGE_SIZE > cprm->limit) dump.u_ssize = 0; /* make sure we actually have a data and stack area to dump */ @@ -263,6 +264,7 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) #else set_personality(PER_LINUX); #endif + setup_new_exec(bprm); current->mm->end_code = ex.a_text + (current->mm->start_code = N_TXTADDR(ex)); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 97b6e9efeb7..fd5b2ea5d29 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -45,7 +45,7 @@ static unsigned long elf_map(struct file *, unsigned long, struct elf_phdr *, * don't even try. */ #ifdef CONFIG_ELF_CORE -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int elf_core_dump(struct coredump_params *cprm); #else #define elf_core_dump NULL #endif @@ -662,27 +662,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0') goto out_free_interp; - /* - * The early SET_PERSONALITY here is so that the lookup - * for the interpreter happens in the namespace of the - * to-be-execed image. SET_PERSONALITY can select an - * alternate root. - * - * However, SET_PERSONALITY is NOT allowed to switch - * this task into the new images's memory mapping - * policy - that is, TASK_SIZE must still evaluate to - * that which is appropriate to the execing application. - * This is because exit_mmap() needs to have TASK_SIZE - * evaluate to the size of the old image. - * - * So if (say) a 64-bit application is execing a 32-bit - * application it is the architecture's responsibility - * to defer changing the value of TASK_SIZE until the - * switch really is going to happen - do this in - * flush_thread(). - akpm - */ - SET_PERSONALITY(loc->elf_ex); - interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); if (IS_ERR(interpreter)) @@ -730,9 +709,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Verify the interpreter has a valid arch */ if (!elf_check_arch(&loc->interp_elf_ex)) goto out_free_dentry; - } else { - /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -752,7 +728,8 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) current->flags |= PF_RANDOMIZE; - arch_pick_mmap_layout(current->mm); + + setup_new_exec(bprm); /* Do this so that we can load the interpreter, if need be. We will change some of these later */ @@ -1272,8 +1249,9 @@ static int writenote(struct memelfnote *men, struct file *file, } #undef DUMP_WRITE -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static void fill_elf_header(struct elfhdr *elf, int segs, @@ -1901,7 +1879,7 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; mm_segment_t fs; @@ -1947,7 +1925,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un * notes. This also sets up the file header. */ if (!fill_note_info(elf, segs + 1, /* including notes section */ - &info, signr, regs)) + &info, cprm->signr, cprm->regs)) goto cleanup; has_dumped = 1; @@ -2009,14 +1987,14 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un #endif /* write out the notes section */ - if (!write_note_info(&info, file, &foffset)) + if (!write_note_info(&info, cprm->file, &foffset)) goto end_coredump; - if (elf_coredump_extra_notes_write(file, &foffset)) + if (elf_coredump_extra_notes_write(cprm->file, &foffset)) goto end_coredump; /* Align to page */ - if (!dump_seek(file, dataoff - foffset)) + if (!dump_seek(cprm->file, dataoff - foffset)) goto end_coredump; for (vma = first_vma(current, gate_vma); vma != NULL; @@ -2033,12 +2011,13 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un page = get_dump_page(addr); if (page) { void *kaddr = kmap(page); - stop = ((size += PAGE_SIZE) > limit) || - !dump_write(file, kaddr, PAGE_SIZE); + stop = ((size += PAGE_SIZE) > cprm->limit) || + !dump_write(cprm->file, kaddr, + PAGE_SIZE); kunmap(page); page_cache_release(page); } else - stop = !dump_seek(file, PAGE_SIZE); + stop = !dump_seek(cprm->file, PAGE_SIZE); if (stop) goto end_coredump; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7b055385db8..18d77297ccc 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -76,7 +76,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *, struct file *, struct mm_struct *); #ifdef CONFIG_ELF_CORE -static int elf_fdpic_core_dump(long, struct pt_regs *, struct file *, unsigned long limit); +static int elf_fdpic_core_dump(struct coredump_params *cprm); #endif static struct linux_binfmt elf_fdpic_format = { @@ -171,6 +171,9 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; #endif +#ifndef CONFIG_MMU + unsigned long stack_prot; +#endif struct file *interpreter = NULL; /* to shut gcc up */ char *interpreter_name = NULL; int executable_stack; @@ -316,6 +319,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, * defunct, deceased, etc. after this point we have to exit via * error_kill */ set_personality(PER_LINUX_FDPIC); + if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) + current->personality |= READ_IMPLIES_EXEC; + + setup_new_exec(bprm); + set_binfmt(&elf_fdpic_format); current->mm->start_code = 0; @@ -377,9 +385,13 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, if (stack_size < PAGE_SIZE * 2) stack_size = PAGE_SIZE * 2; + stack_prot = PROT_READ | PROT_WRITE; + if (executable_stack == EXSTACK_ENABLE_X || + (executable_stack == EXSTACK_DEFAULT && VM_STACK_FLAGS & VM_EXEC)) + stack_prot |= PROT_EXEC; + down_write(¤t->mm->mmap_sem); - current->mm->start_brk = do_mmap(NULL, 0, stack_size, - PROT_READ | PROT_WRITE | PROT_EXEC, + current->mm->start_brk = do_mmap(NULL, 0, stack_size, stack_prot, MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED | MAP_GROWSDOWN, 0); @@ -1326,8 +1338,9 @@ static int writenote(struct memelfnote *men, struct file *file) #undef DUMP_WRITE #undef DUMP_SEEK -#define DUMP_WRITE(addr, nr) \ - if ((size += (nr)) > limit || !dump_write(file, (addr), (nr))) \ +#define DUMP_WRITE(addr, nr) \ + if ((size += (nr)) > cprm->limit || \ + !dump_write(cprm->file, (addr), (nr))) \ goto end_coredump; static inline void fill_elf_fdpic_header(struct elfhdr *elf, int segs) @@ -1582,8 +1595,7 @@ static int elf_fdpic_dump_segments(struct file *file, size_t *size, * and then they are actually written out. If we run out of core limit * we just truncate. */ -static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, - struct file *file, unsigned long limit) +static int elf_fdpic_core_dump(struct coredump_params *cprm) { #define NUM_NOTES 6 int has_dumped = 0; @@ -1642,7 +1654,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, goto cleanup; #endif - if (signr) { + if (cprm->signr) { struct core_thread *ct; struct elf_thread_status *tmp; @@ -1661,14 +1673,14 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, int sz; tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + sz = elf_dump_thread_status(cprm->signr, tmp); thread_status_size += sz; } } /* now collect the dump for the current */ - fill_prstatus(prstatus, current, signr); - elf_core_copy_regs(&prstatus->pr_reg, regs); + fill_prstatus(prstatus, current, cprm->signr); + elf_core_copy_regs(&prstatus->pr_reg, cprm->regs); segs = current->mm->map_count; #ifdef ELF_CORE_EXTRA_PHDRS @@ -1703,7 +1715,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* Try to dump the FPU. */ if ((prstatus->pr_fpvalid = - elf_core_copy_task_fpregs(current, regs, fpu))) + elf_core_copy_task_fpregs(current, cprm->regs, fpu))) fill_note(notes + numnote++, "CORE", NT_PRFPREG, sizeof(*fpu), fpu); #ifdef ELF_CORE_COPY_XFPREGS @@ -1774,7 +1786,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, /* write out the notes section */ for (i = 0; i < numnote; i++) - if (!writenote(notes + i, file)) + if (!writenote(notes + i, cprm->file)) goto end_coredump; /* write out the thread status notes section */ @@ -1783,25 +1795,26 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, list_entry(t, struct elf_thread_status, list); for (i = 0; i < tmp->num_notes; i++) - if (!writenote(&tmp->notes[i], file)) + if (!writenote(&tmp->notes[i], cprm->file)) goto end_coredump; } - if (!dump_seek(file, dataoff)) + if (!dump_seek(cprm->file, dataoff)) goto end_coredump; - if (elf_fdpic_dump_segments(file, &size, &limit, mm_flags) < 0) + if (elf_fdpic_dump_segments(cprm->file, &size, &cprm->limit, + mm_flags) < 0) goto end_coredump; #ifdef ELF_CORE_WRITE_EXTRA_DATA ELF_CORE_WRITE_EXTRA_DATA; #endif - if (file->f_pos != offset) { + if (cprm->file->f_pos != offset) { /* Sanity check */ printk(KERN_WARNING "elf_core_dump: file->f_pos (%lld) != offset (%lld)\n", - file->f_pos, offset); + cprm->file->f_pos, offset); } end_coredump: diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index a2796651e75..42c6b4a5444 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -87,7 +87,7 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *, struct pt_regs * regs); -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit); +static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, @@ -102,10 +102,10 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. */ -static int flat_core_dump(long signr, struct pt_regs *regs, struct file *file, unsigned long limit) +static int flat_core_dump(struct coredump_params *cprm) { printk("Process %s:%d received signr %d and should have core dumped\n", - current->comm, current->pid, (int) signr); + current->comm, current->pid, (int) cprm->signr); return(1); } @@ -519,6 +519,7 @@ static int load_flat_file(struct linux_binprm * bprm, /* OK, This is the point of no return */ set_personality(PER_LINUX_32BIT); + setup_new_exec(bprm); } /* diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index eff74b9c9e7..cc8560f6c9b 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -43,7 +43,7 @@ static int load_som_library(struct file *); * don't even try. */ #if 0 -static int som_core_dump(long signr, struct pt_regs *regs, unsigned long limit); +static int som_core_dump(struct coredump_params *cprm); #else #define som_core_dump NULL #endif @@ -227,6 +227,7 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->personality = PER_HPUX; + setup_new_exec(bprm); /* Set the task size for HP-UX processes such that * the gateway page is outside the address space. diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 49a34e7f730..a16f29e888c 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -61,7 +61,7 @@ static inline unsigned int vecs_to_idx(unsigned int nr) static inline int use_bip_pool(unsigned int idx) { - if (idx == BIOVEC_NR_POOLS) + if (idx == BIOVEC_MAX_IDX) return 1; return 0; @@ -95,6 +95,7 @@ struct bio_integrity_payload *bio_integrity_alloc_bioset(struct bio *bio, /* Use mempool if lower order alloc failed or max vecs were requested */ if (bip == NULL) { + idx = BIOVEC_MAX_IDX; /* so we free the payload properly later */ bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask); if (unlikely(bip == NULL)) { @@ -78,7 +78,7 @@ static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) i = 0; while (i < bio_slab_nr) { - struct bio_slab *bslab = &bio_slabs[i]; + bslab = &bio_slabs[i]; if (!bslab->slab && entry == -1) entry = i; @@ -542,13 +542,18 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page if (page == prev->bv_page && offset == prev->bv_offset + prev->bv_len) { + unsigned int prev_bv_len = prev->bv_len; prev->bv_len += len; if (q->merge_bvec_fn) { struct bvec_merge_data bvm = { + /* prev_bvec is already charged in + bi_size, discharge it in order to + simulate merging updated prev_bvec + as new bvec. */ .bi_bdev = bio->bi_bdev, .bi_sector = bio->bi_sector, - .bi_size = bio->bi_size, + .bi_size = bio->bi_size - prev_bv_len, .bi_rw = bio->bi_rw, }; diff --git a/fs/block_dev.c b/fs/block_dev.c index 73d6a735b8f..d11d0289f3d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -246,7 +246,8 @@ struct super_block *freeze_bdev(struct block_device *bdev) if (!sb) goto out; if (sb->s_flags & MS_RDONLY) { - deactivate_locked_super(sb); + sb->s_frozen = SB_FREEZE_TRANS; + up_write(&sb->s_umount); mutex_unlock(&bdev->bd_fsfreeze_mutex); return sb; } @@ -307,7 +308,7 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) BUG_ON(sb->s_bdev != bdev); down_write(&sb->s_umount); if (sb->s_flags & MS_RDONLY) - goto out_deactivate; + goto out_unfrozen; if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -321,11 +322,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) } } +out_unfrozen: sb->s_frozen = SB_UNFROZEN; smp_wmb(); wake_up(&sb->s_wait_unfrozen); -out_deactivate: if (sb) deactivate_locked_super(sb); out_unlock: diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 402afe0a0bf..7bb3c020e57 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -4,7 +4,6 @@ config BTRFS_FS select LIBCRC32C select ZLIB_INFLATE select ZLIB_DEFLATE - select FS_JOURNAL_INFO help Btrfs is a new filesystem with extents, writable snapshotting, support for multiple devices and many more features. diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 52cbe47022b..6df6d6ed74f 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -94,7 +94,8 @@ static int btrfs_xattr_acl_get(struct dentry *dentry, const char *name, /* * Needs to be called with fs_mutex held */ -static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +static int btrfs_set_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct posix_acl *acl, int type) { int ret, size = 0; const char *name; @@ -111,12 +112,14 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: mode = inode->i_mode; - ret = posix_acl_equiv_mode(acl, &mode); - if (ret < 0) - return ret; - ret = 0; - inode->i_mode = mode; name = POSIX_ACL_XATTR_ACCESS; + if (acl) { + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + inode->i_mode = mode; + } + ret = 0; break; case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) @@ -140,8 +143,7 @@ static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) goto out; } - ret = __btrfs_setxattr(inode, name, value, size, 0); - + ret = __btrfs_setxattr(trans, inode, name, value, size, 0); out: kfree(value); @@ -154,7 +156,7 @@ out: static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, const void *value, size_t size, int flags, int type) { - int ret = 0; + int ret; struct posix_acl *acl = NULL; if (value) { @@ -167,7 +169,7 @@ static int btrfs_xattr_acl_set(struct dentry *dentry, const char *name, } } - ret = btrfs_set_acl(dentry->d_inode, acl, type); + ret = btrfs_set_acl(NULL, dentry->d_inode, acl, type); posix_acl_release(acl); @@ -196,7 +198,8 @@ int btrfs_check_acl(struct inode *inode, int mask) * stuff has been fixed to work with that. If the locking stuff changes, we * need to re-evaluate the acl locking stuff. */ -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { struct posix_acl *acl = NULL; int ret = 0; @@ -221,7 +224,8 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) mode_t mode; if (S_ISDIR(inode->i_mode)) { - ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + ret = btrfs_set_acl(trans, inode, acl, + ACL_TYPE_DEFAULT); if (ret) goto failed; } @@ -236,10 +240,11 @@ int btrfs_init_acl(struct inode *inode, struct inode *dir) inode->i_mode = mode; if (ret > 0) { /* we need an acl */ - ret = btrfs_set_acl(inode, clone, + ret = btrfs_set_acl(trans, inode, clone, ACL_TYPE_ACCESS); } } + posix_acl_release(clone); } failed: posix_acl_release(acl); @@ -269,7 +274,7 @@ int btrfs_acl_chmod(struct inode *inode) ret = posix_acl_chmod_masq(clone, inode->i_mode); if (!ret) - ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + ret = btrfs_set_acl(NULL, inode, clone, ACL_TYPE_ACCESS); posix_acl_release(clone); @@ -297,7 +302,8 @@ int btrfs_acl_chmod(struct inode *inode) return 0; } -int btrfs_init_acl(struct inode *inode, struct inode *dir) +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { return 0; } diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index f6783a42f01..3f1f50d9d91 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,9 +44,6 @@ struct btrfs_inode { */ struct extent_io_tree io_failure_tree; - /* held while inesrting or deleting extents from files */ - struct mutex extent_mutex; - /* held while logging the inode in tree-log.c */ struct mutex log_mutex; @@ -166,7 +163,7 @@ static inline struct btrfs_inode *BTRFS_I(struct inode *inode) static inline void btrfs_i_size_write(struct inode *inode, u64 size) { - inode->i_size = size; + i_size_write(inode, size); BTRFS_I(inode)->disk_i_size = size; } diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ec96f3a6d53..c4bc570a396 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -37,6 +37,11 @@ static int balance_node_right(struct btrfs_trans_handle *trans, struct extent_buffer *src_buf); static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); +static int setup_items_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + u32 total_data, u32 total_size, int nr); + struct btrfs_path *btrfs_alloc_path(void) { @@ -451,9 +456,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, extent_buffer_get(cow); spin_unlock(&root->node_lock); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); free_extent_buffer(buf); add_root_to_dirty_list(root); } else { @@ -468,9 +472,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); - btrfs_free_extent(trans, root, buf->start, buf->len, - parent_start, root->root_key.objectid, - level, 0); + btrfs_free_tree_block(trans, root, buf->start, buf->len, + parent_start, root->root_key.objectid, level); } if (unlock_orig) btrfs_tree_unlock(buf); @@ -1030,8 +1033,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_unlock(mid); /* once for the path */ free_extent_buffer(mid); - ret = btrfs_free_extent(trans, root, mid->start, mid->len, - 0, root->root_key.objectid, level, 1); + ret = btrfs_free_tree_block(trans, root, mid->start, mid->len, + 0, root->root_key.objectid, level); /* once for the root ptr */ free_extent_buffer(mid); return ret; @@ -1095,10 +1098,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, 1); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, - blocksize, 0, - root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, + bytenr, blocksize, 0, + root->root_key.objectid, + level); if (wret) ret = wret; } else { @@ -1143,9 +1146,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, wret = del_ptr(trans, root, path, level + 1, pslot); if (wret) ret = wret; - wret = btrfs_free_extent(trans, root, bytenr, blocksize, - 0, root->root_key.objectid, - level, 0); + wret = btrfs_free_tree_block(trans, root, bytenr, blocksize, + 0, root->root_key.objectid, level); if (wret) ret = wret; } else { @@ -2997,75 +2999,85 @@ again: return ret; } -/* - * This function splits a single item into two items, - * giving 'new_key' to the new item and splitting the - * old one at split_offset (from the start of the item). - * - * The path may be released by this operation. After - * the split, the path is pointing to the old item. The - * new item is going to be in the same node as the old one. - * - * Note, the item being split must be smaller enough to live alone on - * a tree block with room for one extra struct btrfs_item - * - * This allows us to split the item in place, keeping a lock on the - * leaf the entire time. - */ -int btrfs_split_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_key *new_key, - unsigned long split_offset) +static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int ins_len) { - u32 item_size; + struct btrfs_key key; struct extent_buffer *leaf; - struct btrfs_key orig_key; - struct btrfs_item *item; - struct btrfs_item *new_item; - int ret = 0; - int slot; - u32 nritems; - u32 orig_offset; - struct btrfs_disk_key disk_key; - char *buf; + struct btrfs_file_extent_item *fi; + u64 extent_len = 0; + u32 item_size; + int ret; leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); - if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) - goto split; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY && + key.type != BTRFS_EXTENT_CSUM_KEY); + + if (btrfs_leaf_free_space(root, leaf) >= ins_len) + return 0; item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_len = btrfs_file_extent_num_bytes(leaf, fi); + } btrfs_release_path(root, path); - path->search_for_split = 1; path->keep_locks = 1; - - ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 1; + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); path->search_for_split = 0; + if (ret < 0) + goto err; + ret = -EAGAIN; + leaf = path->nodes[0]; /* if our item isn't there or got smaller, return now */ - if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], - path->slots[0])) { - path->keep_locks = 0; - return -EAGAIN; + if (ret > 0 || item_size != btrfs_item_size_nr(leaf, path->slots[0])) + goto err; + + if (key.type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (extent_len != btrfs_file_extent_num_bytes(leaf, fi)) + goto err; } btrfs_set_path_blocking(path); - ret = split_leaf(trans, root, &orig_key, path, - sizeof(struct btrfs_item), 1); - path->keep_locks = 0; + ret = split_leaf(trans, root, &key, path, ins_len, 1); BUG_ON(ret); + path->keep_locks = 0; btrfs_unlock_up_safe(path, 1); + return 0; +err: + path->keep_locks = 0; + return ret; +} + +static noinline int split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_item *new_item; + int slot; + char *buf; + u32 nritems; + u32 item_size; + u32 orig_offset; + struct btrfs_disk_key disk_key; + leaf = path->nodes[0]; BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); -split: - /* - * make sure any changes to the path from split_leaf leave it - * in a blocking state - */ btrfs_set_path_blocking(path); item = btrfs_item_nr(leaf, path->slots[0]); @@ -3073,19 +3085,19 @@ split: item_size = btrfs_item_size(leaf, item); buf = kmalloc(item_size, GFP_NOFS); + if (!buf) + return -ENOMEM; + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, path->slots[0]), item_size); - slot = path->slots[0] + 1; - leaf = path->nodes[0]; + slot = path->slots[0] + 1; nritems = btrfs_header_nritems(leaf); - if (slot != nritems) { /* shift the items */ memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), - btrfs_item_nr_offset(slot), - (nritems - slot) * sizeof(struct btrfs_item)); - + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); } btrfs_cpu_key_to_disk(&disk_key, new_key); @@ -3113,16 +3125,81 @@ split: item_size - split_offset); btrfs_mark_buffer_dirty(leaf); - ret = 0; - if (btrfs_leaf_free_space(root, leaf) < 0) { - btrfs_print_leaf(root, leaf); - BUG(); - } + BUG_ON(btrfs_leaf_free_space(root, leaf) < 0); kfree(buf); + return 0; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + int ret; + ret = setup_leaf_for_split(trans, root, path, + sizeof(struct btrfs_item)); + if (ret) + return ret; + + ret = split_item(trans, root, path, new_key, split_offset); return ret; } /* + * This function duplicate a item, giving 'new_key' to the new item. + * It guarantees both items live in the same tree leaf and the new item + * is contiguous with the original item. + * + * This allows us to split file extent in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct extent_buffer *leaf; + int ret; + u32 item_size; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ret = setup_leaf_for_split(trans, root, path, + item_size + sizeof(struct btrfs_item)); + if (ret) + return ret; + + path->slots[0]++; + ret = setup_items_for_insert(trans, root, path, new_key, &item_size, + item_size, item_size + + sizeof(struct btrfs_item), 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + memcpy_extent_buffer(leaf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + btrfs_item_ptr_offset(leaf, path->slots[0] - 1), + item_size); + return 0; +} + +/* * make the item pointed to by the path smaller. new_size indicates * how small to make it, and from_end tells us if we just chop bytes * off the end of the item or if we shift the item to chop bytes off @@ -3714,8 +3791,8 @@ static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, */ btrfs_unlock_up_safe(path, 0); - ret = btrfs_free_extent(trans, root, leaf->start, leaf->len, - 0, root->root_key.objectid, 0, 0); + ret = btrfs_free_tree_block(trans, root, leaf->start, leaf->len, + 0, root->root_key.objectid, 0); return ret; } /* diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 444b3e9b92a..2aa8ec6a098 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -310,6 +310,9 @@ struct btrfs_header { #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ sizeof(struct btrfs_file_extent_item)) +#define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) -\ + sizeof(struct btrfs_dir_item)) /* @@ -859,8 +862,9 @@ struct btrfs_fs_info { struct mutex ordered_operations_mutex; struct rw_semaphore extent_commit_sem; - struct rw_semaphore subvol_sem; + struct rw_semaphore cleanup_work_sem; + struct rw_semaphore subvol_sem; struct srcu_struct subvol_srcu; struct list_head trans_list; @@ -868,6 +872,9 @@ struct btrfs_fs_info { struct list_head dead_roots; struct list_head caching_block_groups; + spinlock_t delayed_iput_lock; + struct list_head delayed_iputs; + atomic_t nr_async_submits; atomic_t async_submit_draining; atomic_t nr_async_bios; @@ -1034,12 +1041,12 @@ struct btrfs_root { int ref_cows; int track_dirty; int in_radix; + int clean_orphans; u64 defrag_trans_start; struct btrfs_key defrag_progress; struct btrfs_key defrag_max; int defrag_running; - int defrag_level; char *name; int in_sysfs; @@ -1154,6 +1161,7 @@ struct btrfs_root { #define BTRFS_MOUNT_SSD_SPREAD (1 << 8) #define BTRFS_MOUNT_NOSSD (1 << 9) #define BTRFS_MOUNT_DISCARD (1 << 10) +#define BTRFS_MOUNT_FORCE_COMPRESS (1 << 11) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -1975,6 +1983,10 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -2089,6 +2101,10 @@ int btrfs_split_item(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_key *new_key, unsigned long split_offset); +int btrfs_duplicate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key); int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_key *key, struct btrfs_path *p, int ins_len, int cow); @@ -2196,9 +2212,10 @@ int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, struct btrfs_path *path, struct btrfs_dir_item *di); int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir); + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len); struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 dir, @@ -2292,7 +2309,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput); int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); @@ -2332,6 +2349,8 @@ int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_cleanup(struct btrfs_root *root); int btrfs_cont_expand(struct inode *inode, loff_t size); int btrfs_invalidate_inodes(struct btrfs_root *root); +void btrfs_add_delayed_iput(struct inode *inode); +void btrfs_run_delayed_iputs(struct btrfs_root *root); extern const struct dentry_operations btrfs_dentry_operations; /* ioctl.c */ @@ -2345,12 +2364,9 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, int skip_pinned); int btrfs_check_file(struct btrfs_root *root, struct inode *inode); extern const struct file_operations btrfs_file_operations; -int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_block, int drop_cache); +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end); int btrfs_release_file(struct inode *inode, struct file *file); @@ -2380,7 +2396,8 @@ int btrfs_check_acl(struct inode *inode, int mask); #else #define btrfs_check_acl NULL #endif -int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_init_acl(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); int btrfs_acl_chmod(struct inode *inode); /* relocation.c */ diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index f3a6075519c..e9103b3baa4 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -68,12 +68,12 @@ static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle * into the tree */ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, - struct btrfs_root *root, const char *name, - u16 name_len, const void *data, u16 data_len, - u64 dir) + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + const char *name, u16 name_len, + const void *data, u16 data_len) { int ret = 0; - struct btrfs_path *path; struct btrfs_dir_item *dir_item; unsigned long name_ptr, data_ptr; struct btrfs_key key, location; @@ -81,15 +81,11 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; u32 data_size; - key.objectid = dir; + BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); + + key.objectid = objectid; btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); key.offset = btrfs_name_hash(name, name_len); - path = btrfs_alloc_path(); - if (!path) - return -ENOMEM; - if (name_len + data_len + sizeof(struct btrfs_dir_item) > - BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) - return -ENOSPC; data_size = sizeof(*dir_item) + name_len + data_len; dir_item = insert_with_overflow(trans, root, path, &key, data_size, @@ -117,7 +113,6 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, write_extent_buffer(leaf, data, data_ptr, data_len); btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 02b6afbd745..2b59201b955 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -892,6 +892,8 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->stripesize = stripesize; root->ref_cows = 0; root->track_dirty = 0; + root->in_radix = 0; + root->clean_orphans = 0; root->fs_info = fs_info; root->objectid = objectid; @@ -928,7 +930,6 @@ static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, root->defrag_trans_start = fs_info->generation; init_completion(&root->kobj_unregister); root->defrag_running = 0; - root->defrag_level = 0; root->root_key.objectid = objectid; root->anon_super.s_root = NULL; root->anon_super.s_dev = 0; @@ -980,12 +981,12 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, while (1) { ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log_root_tree->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log_root_tree->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } eb = fs_info->log_root_tree->node; @@ -1210,8 +1211,10 @@ again: ret = radix_tree_insert(&fs_info->fs_roots_radix, (unsigned long)root->root_key.objectid, root); - if (ret == 0) + if (ret == 0) { root->in_radix = 1; + root->clean_orphans = 1; + } spin_unlock(&fs_info->fs_roots_radix_lock); radix_tree_preload_end(); if (ret) { @@ -1225,10 +1228,6 @@ again: ret = btrfs_find_dead_roots(fs_info->tree_root, root->root_key.objectid); WARN_ON(ret); - - if (!(fs_info->sb->s_flags & MS_RDONLY)) - btrfs_orphan_cleanup(root); - return root; fail: free_fs_root(root); @@ -1477,6 +1476,7 @@ static int cleaner_kthread(void *arg) if (!(root->fs_info->sb->s_flags & MS_RDONLY) && mutex_trylock(&root->fs_info->cleaner_mutex)) { + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); } @@ -1606,6 +1606,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->delayed_iputs); INIT_LIST_HEAD(&fs_info->hashers); INIT_LIST_HEAD(&fs_info->delalloc_inodes); INIT_LIST_HEAD(&fs_info->ordered_operations); @@ -1614,6 +1615,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, spin_lock_init(&fs_info->new_trans_lock); spin_lock_init(&fs_info->ref_cache_lock); spin_lock_init(&fs_info->fs_roots_radix_lock); + spin_lock_init(&fs_info->delayed_iput_lock); init_completion(&fs_info->kobj_unregister); fs_info->tree_root = tree_root; @@ -1689,6 +1691,7 @@ struct btrfs_root *open_ctree(struct super_block *sb, mutex_init(&fs_info->cleaner_mutex); mutex_init(&fs_info->volume_mutex); init_rwsem(&fs_info->extent_commit_sem); + init_rwsem(&fs_info->cleanup_work_sem); init_rwsem(&fs_info->subvol_sem); btrfs_init_free_cluster(&fs_info->meta_alloc_cluster); @@ -1979,7 +1982,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!(sb->s_flags & MS_RDONLY)) { ret = btrfs_recover_relocation(tree_root); - BUG_ON(ret); + if (ret < 0) { + printk(KERN_WARNING + "btrfs: failed to recover relocation\n"); + err = -EINVAL; + goto fail_trans_kthread; + } } location.objectid = BTRFS_FS_TREE_OBJECTID; @@ -1990,6 +1998,12 @@ struct btrfs_root *open_ctree(struct super_block *sb, if (!fs_info->fs_root) goto fail_trans_kthread; + if (!(sb->s_flags & MS_RDONLY)) { + down_read(&fs_info->cleanup_work_sem); + btrfs_orphan_cleanup(fs_info->fs_root); + up_read(&fs_info->cleanup_work_sem); + } + return tree_root; fail_trans_kthread: @@ -2386,8 +2400,14 @@ int btrfs_commit_super(struct btrfs_root *root) int ret; mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_run_delayed_iputs(root); btrfs_clean_old_snapshots(root); mutex_unlock(&root->fs_info->cleaner_mutex); + + /* wait until ongoing cleanup work done */ + down_write(&root->fs_info->cleanup_work_sem); + up_write(&root->fs_info->cleanup_work_sem); + trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); BUG_ON(ret); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 94627c4cc19..559f72489b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -83,6 +83,17 @@ static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) return (cache->flags & bits) == bits; } +void btrfs_get_block_group(struct btrfs_block_group_cache *cache) +{ + atomic_inc(&cache->count); +} + +void btrfs_put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + /* * this adds the block group to the fs_info rb tree for the block group * cache @@ -156,7 +167,7 @@ block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, } } if (ret) - atomic_inc(&ret->count); + btrfs_get_block_group(ret); spin_unlock(&info->block_group_cache_lock); return ret; @@ -195,6 +206,14 @@ static int exclude_super_stripes(struct btrfs_root *root, int stripe_len; int i, nr, ret; + if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) { + stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid; + cache->bytes_super += stripe_len; + ret = add_excluded_extent(root, cache->key.objectid, + stripe_len); + BUG_ON(ret); + } + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); ret = btrfs_rmap_block(&root->fs_info->mapping_tree, @@ -255,7 +274,7 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, if (ret) break; - if (extent_start == start) { + if (extent_start <= start) { start = extent_end + 1; } else if (extent_start > start && extent_start < end) { size = extent_start - start; @@ -399,6 +418,8 @@ err: put_caching_control(caching_ctl); atomic_dec(&block_group->space_info->caching_threads); + btrfs_put_block_group(block_group); + return 0; } @@ -439,6 +460,7 @@ static int cache_block_group(struct btrfs_block_group_cache *cache) up_write(&fs_info->extent_commit_sem); atomic_inc(&cache->space_info->caching_threads); + btrfs_get_block_group(cache); tsk = kthread_run(caching_kthread, cache, "btrfs-cache-%llu\n", cache->key.objectid); @@ -478,12 +500,6 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( return cache; } -void btrfs_put_block_group(struct btrfs_block_group_cache *cache) -{ - if (atomic_dec_and_test(&cache->count)) - kfree(cache); -} - static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, u64 flags) { @@ -2574,7 +2590,7 @@ next_block_group(struct btrfs_root *root, if (node) { cache = rb_entry(node, struct btrfs_block_group_cache, cache_node); - atomic_inc(&cache->count); + btrfs_get_block_group(cache); } else cache = NULL; spin_unlock(&root->fs_info->block_group_cache_lock); @@ -2880,9 +2896,9 @@ static noinline void flush_delalloc_async(struct btrfs_work *work) root = async->root; info = async->info; - btrfs_start_delalloc_inodes(root); + btrfs_start_delalloc_inodes(root, 0); wake_up(&info->flush_wait); - btrfs_wait_ordered_extents(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -2956,8 +2972,8 @@ static void flush_delalloc(struct btrfs_root *root, return; flush: - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); spin_lock(&info->lock); info->flushing = 0; @@ -3454,14 +3470,6 @@ static int update_block_group(struct btrfs_trans_handle *trans, else old_val -= num_bytes; btrfs_set_super_bytes_used(&info->super_copy, old_val); - - /* block accounting for root item */ - old_val = btrfs_root_used(&root->root_item); - if (alloc) - old_val += num_bytes; - else - old_val -= num_bytes; - btrfs_set_root_used(&root->root_item, old_val); spin_unlock(&info->delalloc_lock); while (total) { @@ -4049,6 +4057,21 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, return ret; } +int btrfs_free_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize, + u64 parent, u64 root_objectid, int level) +{ + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) - blocksize; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + + return btrfs_free_extent(trans, root, bytenr, blocksize, + parent, root_objectid, level, 0); +} + static u64 stripe_align(struct btrfs_root *root, u64 val) { u64 mask = ((u64)root->stripesize - 1); @@ -4212,7 +4235,7 @@ search: u64 offset; int cached; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); search_start = block_group->key.objectid; have_block_group: @@ -4300,7 +4323,7 @@ have_block_group: btrfs_put_block_group(block_group); block_group = last_ptr->block_group; - atomic_inc(&block_group->count); + btrfs_get_block_group(block_group); spin_unlock(&last_ptr->lock); spin_unlock(&last_ptr->refill_lock); @@ -4578,7 +4601,6 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - struct btrfs_fs_info *info = root->fs_info; data = btrfs_get_alloc_profile(root, data); again: @@ -4586,17 +4608,9 @@ again: * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations */ - if (empty_size || root->ref_cows) { - if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { - ret = do_chunk_alloc(trans, root->fs_info->extent_root, - 2 * 1024 * 1024, - BTRFS_BLOCK_GROUP_METADATA | - (info->metadata_alloc_profile & - info->avail_metadata_alloc_bits), 0); - } + if (empty_size || root->ref_cows) ret = do_chunk_alloc(trans, root->fs_info->extent_root, num_bytes + 2 * 1024 * 1024, data, 0); - } WARN_ON(num_bytes < root->sectorsize); ret = find_free_extent(trans, root, num_bytes, empty_size, @@ -4897,6 +4911,14 @@ static int alloc_tree_block(struct btrfs_trans_handle *trans, extent_op); BUG_ON(ret); } + + if (root_objectid == root->root_key.objectid) { + u64 used; + spin_lock(&root->node_lock); + used = btrfs_root_used(&root->root_item) + num_bytes; + btrfs_set_root_used(&root->root_item, used); + spin_unlock(&root->node_lock); + } return ret; } @@ -4919,8 +4941,16 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { - set_extent_dirty(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, GFP_NOFS); + /* + * we allow two log transactions at a time, use different + * EXENT bit to differentiate dirty pages. + */ + if (root->log_transid % 2 == 0) + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + else + set_extent_new(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); } else { set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); @@ -5372,10 +5402,6 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, int ret; while (level >= 0) { - if (path->slots[level] >= - btrfs_header_nritems(path->nodes[level])) - break; - ret = walk_down_proc(trans, root, path, wc, lookup_info); if (ret > 0) break; @@ -5383,6 +5409,10 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans, if (level == 0) break; + if (path->slots[level] >= + btrfs_header_nritems(path->nodes[level])) + break; + ret = do_walk_down(trans, root, path, wc, &lookup_info); if (ret > 0) { path->slots[level]++; @@ -7373,9 +7403,7 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) wait_block_group_cache_done(block_group); btrfs_remove_free_space_cache(block_group); - - WARN_ON(atomic_read(&block_group->count) != 1); - kfree(block_group); + btrfs_put_block_group(block_group); spin_lock(&info->block_group_cache_lock); } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 96577e8bf9f..b177ed31961 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3165,10 +3165,9 @@ struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, spin_unlock(&tree->buffer_lock); goto free_eb; } - spin_unlock(&tree->buffer_lock); - /* add one reference for the tree */ atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); return eb; free_eb: diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 46bea0f4dc7..428fcac45f9 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -155,20 +155,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* - * look for an offset in the tree, and if it can't be found, return - * the first offset we can find smaller than 'offset'. - */ -static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) -{ - struct rb_node *prev; - struct rb_node *ret; - ret = __tree_search(root, offset, &prev, NULL); - if (!ret) - return prev; - return ret; -} - /* check to see if two extent_map structs are adjacent and safe to merge */ static int mergable_maps(struct extent_map *prev, struct extent_map *next) { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 77f759302e1..9d080962996 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -179,18 +179,14 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, } flags = em->flags; if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { - if (em->start <= start && - (!testend || em->start + em->len >= start + len)) { + if (testend && em->start + em->len >= start + len) { free_extent_map(em); write_unlock(&em_tree->lock); break; } - if (start < em->start) { - len = em->start - start; - } else { + start = em->start + em->len; + if (testend) len = start + len - (em->start + em->len); - start = em->start + em->len; - } free_extent_map(em); write_unlock(&em_tree->lock); continue; @@ -265,324 +261,253 @@ int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, * If an extent intersects the range but is not entirely inside the range * it is either truncated or split. Anything entirely inside the range * is deleted from the tree. - * - * inline_limit is used to tell this code which offsets in the file to keep - * if they contain inline extents. */ -noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - u64 start, u64 end, u64 locked_end, - u64 inline_limit, u64 *hint_byte, int drop_cache) +int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct inode *inode, + u64 start, u64 end, u64 *hint_byte, int drop_cache) { - u64 extent_end = 0; - u64 search_start = start; - u64 ram_bytes = 0; - u64 disk_bytenr = 0; - u64 orig_locked_end = locked_end; - u8 compression; - u8 encryption; - u16 other_encoding = 0; + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; - struct btrfs_file_extent_item *extent; + struct btrfs_file_extent_item *fi; struct btrfs_path *path; struct btrfs_key key; - struct btrfs_file_extent_item old; - int keep; - int slot; - int bookend; - int found_type = 0; - int found_extent; - int found_inline; + struct btrfs_key new_key; + u64 search_start = start; + u64 disk_bytenr = 0; + u64 num_bytes = 0; + u64 extent_offset = 0; + u64 extent_end = 0; + int del_nr = 0; + int del_slot = 0; + int extent_type; int recow; int ret; - inline_limit = 0; if (drop_cache) btrfs_drop_extent_cache(inode, start, end - 1, 0); path = btrfs_alloc_path(); if (!path) return -ENOMEM; + while (1) { recow = 0; - btrfs_release_path(root, path); ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, search_start, -1); if (ret < 0) - goto out; - if (ret > 0) { - if (path->slots[0] == 0) { - ret = 0; - goto out; - } - path->slots[0]--; + break; + if (ret > 0 && path->slots[0] > 0 && search_start == start) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == inode->i_ino && + key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; } + ret = 0; next_slot: - keep = 0; - bookend = 0; - found_extent = 0; - found_inline = 0; - compression = 0; - encryption = 0; - extent = NULL; leaf = path->nodes[0]; - slot = path->slots[0]; - ret = 0; - btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && - key.offset >= end) { - goto out; - } - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || - key.objectid != inode->i_ino) { - goto out; - } - if (recow) { - search_start = max(key.offset, start); - continue; - } - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { - extent = btrfs_item_ptr(leaf, slot, - struct btrfs_file_extent_item); - found_type = btrfs_file_extent_type(leaf, extent); - compression = btrfs_file_extent_compression(leaf, - extent); - encryption = btrfs_file_extent_encryption(leaf, - extent); - other_encoding = btrfs_file_extent_other_encoding(leaf, - extent); - if (found_type == BTRFS_FILE_EXTENT_REG || - found_type == BTRFS_FILE_EXTENT_PREALLOC) { - extent_end = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (extent_end) - *hint_byte = extent_end; - - extent_end = key.offset + - btrfs_file_extent_num_bytes(leaf, extent); - ram_bytes = btrfs_file_extent_ram_bytes(leaf, - extent); - found_extent = 1; - } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { - found_inline = 1; - extent_end = key.offset + - btrfs_file_extent_inline_len(leaf, extent); + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + BUG_ON(del_nr > 0); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + break; + if (ret > 0) { + ret = 0; + break; } + leaf = path->nodes[0]; + recow = 1; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid > inode->i_ino || + key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= end) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, fi); } else { + WARN_ON(1); extent_end = search_start; } - /* we found nothing we can drop */ - if ((!found_extent && !found_inline) || - search_start >= extent_end) { - int nextret; - u32 nritems; - nritems = btrfs_header_nritems(leaf); - if (slot >= nritems - 1) { - nextret = btrfs_next_leaf(root, path); - if (nextret) - goto out; - recow = 1; - } else { - path->slots[0]++; - } + if (extent_end <= search_start) { + path->slots[0]++; goto next_slot; } - if (end <= extent_end && start >= key.offset && found_inline) - *hint_byte = EXTENT_MAP_INLINE; - - if (found_extent) { - read_extent_buffer(leaf, &old, (unsigned long)extent, - sizeof(old)); - } - - if (end < extent_end && end >= key.offset) { - bookend = 1; - if (found_inline && start <= key.offset) - keep = 1; + search_start = max(key.offset, start); + if (recow) { + btrfs_release_path(root, path); + continue; } - if (bookend && found_extent) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, - GFP_NOFS); - locked_end = extent_end; - continue; - } - locked_end = extent_end; + /* + * | - range to drop - | + * | -------- extent -------- | + */ + if (start > key.offset && end < extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = start; + ret = btrfs_duplicate_item(trans, root, path, + &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + continue; } - disk_bytenr = le64_to_cpu(old.disk_bytenr); - if (disk_bytenr != 0) { + if (ret < 0) + break; + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + extent_offset += start - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - start); + btrfs_mark_buffer_dirty(leaf); + + if (disk_bytenr > 0) { ret = btrfs_inc_extent_ref(trans, root, - disk_bytenr, - le64_to_cpu(old.disk_num_bytes), 0, - root->root_key.objectid, - key.objectid, key.offset - - le64_to_cpu(old.offset)); + disk_bytenr, num_bytes, 0, + root->root_key.objectid, + new_key.objectid, + start - extent_offset); BUG_ON(ret); + *hint_byte = disk_bytenr; } + key.offset = start; } + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start <= key.offset && end < extent_end) { + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - if (found_inline) { - u64 mask = root->sectorsize - 1; - search_start = (extent_end + mask) & ~mask; - } else - search_start = extent_end; - - /* truncate existing extent */ - if (start > key.offset) { - u64 new_num; - u64 old_num; - keep = 1; - WARN_ON(start & (root->sectorsize - 1)); - if (found_extent) { - new_num = start - key.offset; - old_num = btrfs_file_extent_num_bytes(leaf, - extent); - *hint_byte = - btrfs_file_extent_disk_bytenr(leaf, - extent); - if (btrfs_file_extent_disk_bytenr(leaf, - extent)) { - inode_sub_bytes(inode, old_num - - new_num); - } - btrfs_set_file_extent_num_bytes(leaf, - extent, new_num); - btrfs_mark_buffer_dirty(leaf); - } else if (key.offset < inline_limit && - (end > extent_end) && - (inline_limit < extent_end)) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - inline_limit - key.offset); - inode_sub_bytes(inode, extent_end - - inline_limit); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) { - btrfs_truncate_item(trans, root, path, - new_size, 1); - } + memcpy(&new_key, &key, sizeof(new_key)); + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + + extent_offset += end - key.offset; + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, end - key.offset); + *hint_byte = disk_bytenr; } + break; } - /* delete the entire extent */ - if (!keep) { - if (found_inline) - inode_sub_bytes(inode, extent_end - - key.offset); - ret = btrfs_del_item(trans, root, path); - /* TODO update progress marker and return */ - BUG_ON(ret); - extent = NULL; - btrfs_release_path(root, path); - /* the extent will be freed later */ - } - if (bookend && found_inline && start <= key.offset) { - u32 new_size; - new_size = btrfs_file_extent_calc_inline_size( - extent_end - end); - inode_sub_bytes(inode, end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, extent, - new_size); - if (!compression && !encryption) - ret = btrfs_truncate_item(trans, root, path, - new_size, 0); - BUG_ON(ret); - } - /* create bookend, splitting the extent in two */ - if (bookend && found_extent) { - struct btrfs_key ins; - ins.objectid = inode->i_ino; - ins.offset = end; - btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); - btrfs_release_path(root, path); - path->leave_spinning = 1; - ret = btrfs_insert_empty_item(trans, root, path, &ins, - sizeof(*extent)); - BUG_ON(ret); + search_start = extent_end; + /* + * | ---- range to drop ----- | + * | -------- extent -------- | + */ + if (start > key.offset && end >= extent_end) { + BUG_ON(del_nr > 0); + BUG_ON(extent_type == BTRFS_FILE_EXTENT_INLINE); - leaf = path->nodes[0]; - extent = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - write_extent_buffer(leaf, &old, - (unsigned long)extent, sizeof(old)); - - btrfs_set_file_extent_compression(leaf, extent, - compression); - btrfs_set_file_extent_encryption(leaf, extent, - encryption); - btrfs_set_file_extent_other_encoding(leaf, extent, - other_encoding); - btrfs_set_file_extent_offset(leaf, extent, - le64_to_cpu(old.offset) + end - key.offset); - WARN_ON(le64_to_cpu(old.num_bytes) < - (extent_end - end)); - btrfs_set_file_extent_num_bytes(leaf, extent, - extent_end - end); + btrfs_set_file_extent_num_bytes(leaf, fi, + start - key.offset); + btrfs_mark_buffer_dirty(leaf); + if (disk_bytenr > 0) { + inode_sub_bytes(inode, extent_end - start); + *hint_byte = disk_bytenr; + } + if (end == extent_end) + break; - /* - * set the ram bytes to the size of the full extent - * before splitting. This is a worst case flag, - * but its the best we can do because we don't know - * how splitting affects compression - */ - btrfs_set_file_extent_ram_bytes(leaf, extent, - ram_bytes); - btrfs_set_file_extent_type(leaf, extent, found_type); - - btrfs_unlock_up_safe(path, 1); - btrfs_mark_buffer_dirty(path->nodes[0]); - btrfs_set_lock_blocking(path->nodes[0]); - - path->leave_spinning = 0; - btrfs_release_path(root, path); - if (disk_bytenr != 0) - inode_add_bytes(inode, extent_end - end); + path->slots[0]++; + goto next_slot; } - if (found_extent && !keep) { - u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + /* + * | ---- range to drop ----- | + * | ------ extent ------ | + */ + if (start <= key.offset && end >= extent_end) { + if (del_nr == 0) { + del_slot = path->slots[0]; + del_nr = 1; + } else { + BUG_ON(del_slot + del_nr != path->slots[0]); + del_nr++; + } - if (old_disk_bytenr != 0) { + if (extent_type == BTRFS_FILE_EXTENT_INLINE) { inode_sub_bytes(inode, - le64_to_cpu(old.num_bytes)); + extent_end - key.offset); + extent_end = ALIGN(extent_end, + root->sectorsize); + } else if (disk_bytenr > 0) { ret = btrfs_free_extent(trans, root, - old_disk_bytenr, - le64_to_cpu(old.disk_num_bytes), - 0, root->root_key.objectid, + disk_bytenr, num_bytes, 0, + root->root_key.objectid, key.objectid, key.offset - - le64_to_cpu(old.offset)); + extent_offset); BUG_ON(ret); - *hint_byte = old_disk_bytenr; + inode_sub_bytes(inode, + extent_end - key.offset); + *hint_byte = disk_bytenr; } - } - if (search_start >= end) { - ret = 0; - goto out; + if (end == extent_end) + break; + + if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) { + path->slots[0]++; + goto next_slot; + } + + ret = btrfs_del_items(trans, root, path, del_slot, + del_nr); + BUG_ON(ret); + + del_nr = 0; + del_slot = 0; + + btrfs_release_path(root, path); + continue; } + + BUG_ON(1); } -out: - btrfs_free_path(path); - if (locked_end > orig_locked_end) { - unlock_extent(&BTRFS_I(inode)->io_tree, orig_locked_end, - locked_end - 1, GFP_NOFS); + + if (del_nr > 0) { + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); } + + btrfs_free_path(path); return ret; } static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 *start, u64 *end) + u64 objectid, u64 bytenr, u64 orig_offset, + u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; @@ -598,6 +523,7 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset || btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) @@ -620,23 +546,24 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, * two or three. */ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, u64 start, u64 end) { + struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_buffer *leaf; struct btrfs_path *path; struct btrfs_file_extent_item *fi; struct btrfs_key key; + struct btrfs_key new_key; u64 bytenr; u64 num_bytes; u64 extent_end; u64 orig_offset; u64 other_start; u64 other_end; - u64 split = start; - u64 locked_end = end; - int extent_type; - int split_end = 1; + u64 split; + int del_nr = 0; + int del_slot = 0; + int recow; int ret; btrfs_drop_extent_cache(inode, start, end - 1, 0); @@ -644,12 +571,11 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); again: + recow = 0; + split = start; key.objectid = inode->i_ino; key.type = BTRFS_EXTENT_DATA_KEY; - if (split == start) - key.offset = split; - else - key.offset = split - 1; + key.offset = split; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0 && path->slots[0] > 0) @@ -661,159 +587,156 @@ again: key.type != BTRFS_EXTENT_DATA_KEY); fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - extent_type = btrfs_file_extent_type(leaf, fi); - BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_PREALLOC); extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); BUG_ON(key.offset > start || extent_end < end); bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi); + memcpy(&new_key, &key, sizeof(new_key)); - if (key.offset == start) - split = end; - - if (key.offset == start && extent_end == end) { - int del_nr = 0; - int del_slot = 0; - other_start = end; - other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - extent_end = other_end; - del_slot = path->slots[0] + 1; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } + if (start == key.offset && end < extent_end) { other_start = 0; other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - key.offset = other_start; - del_slot = path->slots[0]; - del_nr++; - ret = btrfs_free_extent(trans, root, bytenr, num_bytes, - 0, root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - } - split_end = 0; - if (del_nr == 0) { - btrfs_set_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG); - goto done; - } - - fi = btrfs_item_ptr(leaf, del_slot - 1, - struct btrfs_file_extent_item); - btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); - btrfs_set_file_extent_num_bytes(leaf, fi, - extent_end - key.offset); - btrfs_mark_buffer_dirty(leaf); - - ret = btrfs_del_items(trans, root, path, del_slot, del_nr); - BUG_ON(ret); - goto release; - } else if (split == start) { - if (locked_end < extent_end) { - ret = try_lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - if (!ret) { - btrfs_release_path(root, path); - lock_extent(&BTRFS_I(inode)->io_tree, - locked_end, extent_end - 1, GFP_NOFS); - locked_end = extent_end; - goto again; - } - locked_end = extent_end; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + new_key.offset = end; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - end); + btrfs_set_file_extent_offset(leaf, fi, + end - orig_offset); + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + end - other_start); + btrfs_mark_buffer_dirty(leaf); + goto out; } - btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); - } else { - BUG_ON(key.offset != start); - key.offset = split; - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); - btrfs_set_item_key_safe(trans, root, path, &key); - extent_end = split; } - if (extent_end == end) { - split_end = 0; - extent_type = BTRFS_FILE_EXTENT_REG; - } - if (extent_end == end && split == start) { + if (start > key.offset && end == extent_end) { other_start = end; other_end = 0; - if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]++; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - key.offset = split; - btrfs_set_item_key_safe(trans, root, path, &key); - btrfs_set_file_extent_offset(leaf, fi, key.offset - - orig_offset); btrfs_set_file_extent_num_bytes(leaf, fi, - other_end - split); - goto done; - } - } - if (extent_end == end && split == end) { - other_start = 0; - other_end = start; - if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, - bytenr, &other_start, &other_end)) { - path->slots[0]--; + start - key.offset); + path->slots[0]++; + new_key.offset = start; + btrfs_set_item_key_safe(trans, root, path, &new_key); + fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - - other_start); - goto done; + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - start); + btrfs_set_file_extent_offset(leaf, fi, + start - orig_offset); + btrfs_mark_buffer_dirty(leaf); + goto out; } } - btrfs_mark_buffer_dirty(leaf); + while (start > key.offset || end < extent_end) { + if (key.offset == start) + split = end; - ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, - root->root_key.objectid, - inode->i_ino, orig_offset); - BUG_ON(ret); - btrfs_release_path(root, path); + new_key.offset = split; + ret = btrfs_duplicate_item(trans, root, path, &new_key); + if (ret == -EAGAIN) { + btrfs_release_path(root, path); + goto again; + } + BUG_ON(ret < 0); - key.offset = start; - ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); - BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0] - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, + split - key.offset); - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - btrfs_set_file_extent_generation(leaf, fi, trans->transid); - btrfs_set_file_extent_type(leaf, fi, extent_type); - btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); - btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_offset(leaf, fi, key.offset - orig_offset); - btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); - btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); - btrfs_set_file_extent_compression(leaf, fi, 0); - btrfs_set_file_extent_encryption(leaf, fi, 0); - btrfs_set_file_extent_other_encoding(leaf, fi, 0); -done: - btrfs_mark_buffer_dirty(leaf); - -release: - btrfs_release_path(root, path); - if (split_end && split == start) { - split = end; - goto again; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_file_extent_offset(leaf, fi, split - orig_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - split); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, + root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + + if (split == start) { + key.offset = start; + } else { + BUG_ON(start != key.offset); + path->slots[0]--; + extent_end = end; + } + recow = 1; } - if (locked_end > end) { - unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, - GFP_NOFS); + + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, + inode->i_ino, bytenr, orig_offset, + &other_start, &other_end)) { + if (recow) { + btrfs_release_path(root, path); + goto again; + } + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + 0, root->root_key.objectid, + inode->i_ino, orig_offset); + BUG_ON(ret); } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_mark_buffer_dirty(leaf); + } else { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + } +out: btrfs_free_path(path); return 0; } @@ -1210,7 +1133,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) } mutex_lock(&dentry->d_inode->i_mutex); out: - return ret > 0 ? EIO : ret; + return ret > 0 ? -EIO : ret; } static const struct vm_operations_struct btrfs_file_vm_ops = { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b3ad168a0bf..4deb280f896 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -88,13 +88,14 @@ static noinline int cow_file_range(struct inode *inode, u64 start, u64 end, int *page_started, unsigned long *nr_written, int unlock); -static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) +static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; - err = btrfs_init_acl(inode, dir); + err = btrfs_init_acl(trans, inode, dir); if (!err) - err = btrfs_xattr_security_init(inode, dir); + err = btrfs_xattr_security_init(trans, inode, dir); return err; } @@ -188,8 +189,18 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + /* + * we're an inline extent, so nobody can + * extend the file past i_size without locking + * a page we already have locked. + * + * We must do any isize and inode updates + * before we unlock the pages. Otherwise we + * could end up racing with unlink. + */ BTRFS_I(inode)->disk_i_size = inode->i_size; btrfs_update_inode(trans, root, inode); + return 0; fail: btrfs_free_path(path); @@ -230,8 +241,7 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans, return 1; } - ret = btrfs_drop_extents(trans, root, inode, start, - aligned_end, aligned_end, start, + ret = btrfs_drop_extents(trans, inode, start, aligned_end, &hint_byte, 1); BUG_ON(ret); @@ -416,7 +426,6 @@ again: start, end, total_compressed, pages); } - btrfs_end_transaction(trans, root); if (ret == 0) { /* * inline extent creation worked, we don't need @@ -430,9 +439,11 @@ again: EXTENT_CLEAR_DELALLOC | EXTENT_CLEAR_ACCOUNTING | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); - ret = 0; + + btrfs_end_transaction(trans, root); goto free_pages_out; } + btrfs_end_transaction(trans, root); } if (will_compress) { @@ -472,7 +483,8 @@ again: nr_pages_ret = 0; /* flag the file so we don't compress in the future */ - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; + if (!btrfs_test_opt(root, FORCE_COMPRESS)) + BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; } if (will_compress) { *num_added += 1; @@ -543,7 +555,6 @@ static noinline int submit_compressed_extents(struct inode *inode, if (list_empty(&async_cow->extents)) return 0; - trans = btrfs_join_transaction(root, 1); while (!list_empty(&async_cow->extents)) { async_extent = list_entry(async_cow->extents.next, @@ -590,19 +601,15 @@ retry: lock_extent(io_tree, async_extent->start, async_extent->start + async_extent->ram_size - 1, GFP_NOFS); - /* - * here we're doing allocation and writeback of the - * compressed pages - */ - btrfs_drop_extent_cache(inode, async_extent->start, - async_extent->start + - async_extent->ram_size - 1, 0); + trans = btrfs_join_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, async_extent->compressed_size, async_extent->compressed_size, 0, alloc_hint, (u64)-1, &ins, 1); + btrfs_end_transaction(trans, root); + if (ret) { int i; for (i = 0; i < async_extent->nr_pages; i++) { @@ -618,6 +625,14 @@ retry: goto retry; } + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + em = alloc_extent_map(GFP_NOFS); em->start = async_extent->start; em->len = async_extent->ram_size; @@ -649,8 +664,6 @@ retry: BTRFS_ORDERED_COMPRESSED); BUG_ON(ret); - btrfs_end_transaction(trans, root); - /* * clear dirty, set writeback and unlock the pages. */ @@ -672,13 +685,11 @@ retry: async_extent->nr_pages); BUG_ON(ret); - trans = btrfs_join_transaction(root, 1); alloc_hint = ins.objectid + ins.offset; kfree(async_extent); cond_resched(); } - btrfs_end_transaction(trans, root); return 0; } @@ -742,6 +753,7 @@ static noinline int cow_file_range(struct inode *inode, EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK); + *nr_written = *nr_written + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; *page_started = 1; @@ -1596,7 +1608,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, struct inode *inode, u64 file_pos, u64 disk_bytenr, u64 disk_num_bytes, u64 num_bytes, u64 ram_bytes, - u64 locked_end, u8 compression, u8 encryption, u16 other_encoding, int extent_type) { @@ -1622,9 +1633,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * the caller is expected to unpin it and allow it to be merged * with the others. */ - ret = btrfs_drop_extents(trans, root, inode, file_pos, - file_pos + num_bytes, locked_end, - file_pos, &hint, 0); + ret = btrfs_drop_extents(trans, inode, file_pos, file_pos + num_bytes, + &hint, 0); BUG_ON(ret); ins.objectid = inode->i_ino; @@ -1671,24 +1681,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, * before we start the transaction. It limits the amount of btree * reads required while inside the transaction. */ -static noinline void reada_csum(struct btrfs_root *root, - struct btrfs_path *path, - struct btrfs_ordered_extent *ordered_extent) -{ - struct btrfs_ordered_sum *sum; - u64 bytenr; - - sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum, - list); - bytenr = sum->sums[0].bytenr; - - /* - * we don't care about the results, the point of this search is - * just to get the btree leaves into ram - */ - btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0); -} - /* as ordered data IO finishes, this gets called so we can finish * an ordered extent if the range of bytes in the file it covers are * fully written. @@ -1699,7 +1691,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) struct btrfs_trans_handle *trans; struct btrfs_ordered_extent *ordered_extent = NULL; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - struct btrfs_path *path; int compressed = 0; int ret; @@ -1707,46 +1698,32 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) if (!ret) return 0; - /* - * before we join the transaction, try to do some of our IO. - * This will limit the amount of IO that we have to do with - * the transaction running. We're unlikely to need to do any - * IO if the file extents are new, the disk_i_size checks - * covers the most common case. - */ - if (start < BTRFS_I(inode)->disk_i_size) { - path = btrfs_alloc_path(); - if (path) { - ret = btrfs_lookup_file_extent(NULL, root, path, - inode->i_ino, - start, 0); - ordered_extent = btrfs_lookup_ordered_extent(inode, - start); - if (!list_empty(&ordered_extent->list)) { - btrfs_release_path(root, path); - reada_csum(root, path, ordered_extent); - } - btrfs_free_path(path); + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { + BUG_ON(!list_empty(&ordered_extent->list)); + ret = btrfs_ordered_update_i_size(inode, 0, ordered_extent); + if (!ret) { + trans = btrfs_join_transaction(root, 1); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); } + goto out; } - trans = btrfs_join_transaction(root, 1); - - if (!ordered_extent) - ordered_extent = btrfs_lookup_ordered_extent(inode, start); - BUG_ON(!ordered_extent); - if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) - goto nocow; - lock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) compressed = 1; if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { BUG_ON(compressed); - ret = btrfs_mark_extent_written(trans, root, inode, + ret = btrfs_mark_extent_written(trans, inode, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len); @@ -1758,8 +1735,6 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) ordered_extent->disk_len, ordered_extent->len, ordered_extent->len, - ordered_extent->file_offset + - ordered_extent->len, compressed, 0, 0, BTRFS_FILE_EXTENT_REG); unpin_extent_cache(&BTRFS_I(inode)->extent_tree, @@ -1770,22 +1745,20 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) unlock_extent(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, GFP_NOFS); -nocow: add_pending_csums(trans, inode, ordered_extent->file_offset, &ordered_extent->list); - mutex_lock(&BTRFS_I(inode)->extent_mutex); - btrfs_ordered_update_i_size(inode, ordered_extent); - btrfs_update_inode(trans, root, inode); - btrfs_remove_ordered_extent(inode, ordered_extent); - mutex_unlock(&BTRFS_I(inode)->extent_mutex); - + /* this also removes the ordered extent from the tree */ + btrfs_ordered_update_i_size(inode, 0, ordered_extent); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + btrfs_end_transaction(trans, root); +out: /* once for us */ btrfs_put_ordered_extent(ordered_extent); /* once for the tree */ btrfs_put_ordered_extent(ordered_extent); - btrfs_end_transaction(trans, root); return 0; } @@ -2008,6 +1981,54 @@ zeroit: return -EIO; } +struct delayed_iput { + struct list_head list; + struct inode *inode; +}; + +void btrfs_add_delayed_iput(struct inode *inode) +{ + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct delayed_iput *delayed; + + if (atomic_add_unless(&inode->i_count, -1, 1)) + return; + + delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); + delayed->inode = inode; + + spin_lock(&fs_info->delayed_iput_lock); + list_add_tail(&delayed->list, &fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); +} + +void btrfs_run_delayed_iputs(struct btrfs_root *root) +{ + LIST_HEAD(list); + struct btrfs_fs_info *fs_info = root->fs_info; + struct delayed_iput *delayed; + int empty; + + spin_lock(&fs_info->delayed_iput_lock); + empty = list_empty(&fs_info->delayed_iputs); + spin_unlock(&fs_info->delayed_iput_lock); + if (empty) + return; + + down_read(&root->fs_info->cleanup_work_sem); + spin_lock(&fs_info->delayed_iput_lock); + list_splice_init(&fs_info->delayed_iputs, &list); + spin_unlock(&fs_info->delayed_iput_lock); + + while (!list_empty(&list)) { + delayed = list_entry(list.next, struct delayed_iput, list); + list_del(&delayed->list); + iput(delayed->inode); + kfree(delayed); + } + up_read(&root->fs_info->cleanup_work_sem); +} + /* * This creates an orphan entry for the given inode in case something goes * wrong in the middle of an unlink/truncate. @@ -2080,16 +2101,17 @@ void btrfs_orphan_cleanup(struct btrfs_root *root) struct inode *inode; int ret = 0, nr_unlink = 0, nr_truncate = 0; - path = btrfs_alloc_path(); - if (!path) + if (!xchg(&root->clean_orphans, 0)) return; + + path = btrfs_alloc_path(); + BUG_ON(!path); path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); key.offset = (u64)-1; - while (1) { ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -2834,37 +2856,40 @@ out: * min_type is the minimum key type to truncate down to. If set to 0, this * will kill all the items on this inode, including the INODE_ITEM_KEY. */ -noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, - struct btrfs_root *root, - struct inode *inode, - u64 new_size, u32 min_type) +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) { - int ret; struct btrfs_path *path; - struct btrfs_key key; - struct btrfs_key found_key; - u32 found_type = (u8)-1; struct extent_buffer *leaf; struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; u64 extent_start = 0; u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; + u64 mask = root->sectorsize - 1; + u32 found_type = (u8)-1; int found_extent; int del_item; int pending_del_nr = 0; int pending_del_slot = 0; int extent_type = -1; int encoding; - u64 mask = root->sectorsize - 1; + int ret; + int err = 0; + + BUG_ON(new_size > 0 && min_type != BTRFS_EXTENT_DATA_KEY); if (root->ref_cows) btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); BUG_ON(!path); path->reada = -1; - /* FIXME, add redo link to tree so we don't leak on crash */ key.objectid = inode->i_ino; key.offset = (u64)-1; key.type = (u8)-1; @@ -2872,17 +2897,17 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, search_again: path->leave_spinning = 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); - if (ret < 0) - goto error; + if (ret < 0) { + err = ret; + goto out; + } if (ret > 0) { /* there are no items in the tree for us to truncate, we're * done */ - if (path->slots[0] == 0) { - ret = 0; - goto error; - } + if (path->slots[0] == 0) + goto out; path->slots[0]--; } @@ -2917,28 +2942,17 @@ search_again: } item_end--; } - if (item_end < new_size) { - if (found_type == BTRFS_DIR_ITEM_KEY) - found_type = BTRFS_INODE_ITEM_KEY; - else if (found_type == BTRFS_EXTENT_ITEM_KEY) - found_type = BTRFS_EXTENT_DATA_KEY; - else if (found_type == BTRFS_EXTENT_DATA_KEY) - found_type = BTRFS_XATTR_ITEM_KEY; - else if (found_type == BTRFS_XATTR_ITEM_KEY) - found_type = BTRFS_INODE_REF_KEY; - else if (found_type) - found_type--; - else + if (found_type > min_type) { + del_item = 1; + } else { + if (item_end < new_size) break; - btrfs_set_key_type(&key, found_type); - goto next; + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; } - if (found_key.offset >= new_size) - del_item = 1; - else - del_item = 0; found_extent = 0; - /* FIXME, shrink the extent if the ref count is only 1 */ if (found_type != BTRFS_EXTENT_DATA_KEY) goto delete; @@ -3025,42 +3039,36 @@ delete: inode->i_ino, extent_offset); BUG_ON(ret); } -next: - if (path->slots[0] == 0) { - if (pending_del_nr) - goto del_pending; - btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; - goto search_again; - } - path->slots[0]--; - if (pending_del_nr && - path->slots[0] + 1 != pending_del_slot) { - struct btrfs_key debug; -del_pending: - btrfs_item_key_to_cpu(path->nodes[0], &debug, - pending_del_slot); - ret = btrfs_del_items(trans, root, path, - pending_del_slot, - pending_del_nr); - BUG_ON(ret); - pending_del_nr = 0; + if (found_type == BTRFS_INODE_ITEM_KEY) + break; + + if (path->slots[0] == 0 || + path->slots[0] != pending_del_slot) { + if (root->ref_cows) { + err = -EAGAIN; + goto out; + } + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + } btrfs_release_path(root, path); - if (found_type == BTRFS_INODE_ITEM_KEY) - break; goto search_again; + } else { + path->slots[0]--; } } - ret = 0; -error: +out: if (pending_del_nr) { ret = btrfs_del_items(trans, root, path, pending_del_slot, pending_del_nr); } btrfs_free_path(path); - return ret; + return err; } /* @@ -3180,10 +3188,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_truncate_page(inode->i_mapping, inode->i_size); - if (err) - return err; - while (1) { struct btrfs_ordered_extent *ordered; btrfs_wait_ordered_range(inode, hole_start, @@ -3196,9 +3200,6 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) btrfs_put_ordered_extent(ordered); } - trans = btrfs_start_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - cur_offset = hole_start; while (1) { em = btrfs_get_extent(inode, NULL, 0, cur_offset, @@ -3206,40 +3207,120 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), block_end); last_byte = (last_byte + mask) & ~mask; - if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { u64 hint_byte = 0; hole_size = last_byte - cur_offset; - err = btrfs_drop_extents(trans, root, inode, - cur_offset, - cur_offset + hole_size, - block_end, - cur_offset, &hint_byte, 1); - if (err) - break; - err = btrfs_reserve_metadata_space(root, 1); + err = btrfs_reserve_metadata_space(root, 2); if (err) break; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + err = btrfs_drop_extents(trans, inode, cur_offset, + cur_offset + hole_size, + &hint_byte, 1); + BUG_ON(err); + err = btrfs_insert_file_extent(trans, root, inode->i_ino, cur_offset, 0, 0, hole_size, 0, hole_size, 0, 0, 0); + BUG_ON(err); + btrfs_drop_extent_cache(inode, hole_start, last_byte - 1, 0); - btrfs_unreserve_metadata_space(root, 1); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 2); } free_extent_map(em); cur_offset = last_byte; - if (err || cur_offset >= block_end) + if (cur_offset >= block_end) break; } - btrfs_end_transaction(trans, root); unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); return err; } +static int btrfs_setattr_size(struct inode *inode, struct iattr *attr) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + unsigned long nr; + int ret; + + if (attr->ia_size == inode->i_size) + return 0; + + if (attr->ia_size > inode->i_size) { + unsigned long limit; + limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + if (attr->ia_size > inode->i_sb->s_maxbytes) + return -EFBIG; + if (limit != RLIM_INFINITY && attr->ia_size > limit) { + send_sig(SIGXFSZ, current, 0); + return -EFBIG; + } + } + + ret = btrfs_reserve_metadata_space(root, 1); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_orphan_add(trans, inode); + BUG_ON(ret); + + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 1); + btrfs_btree_balance_dirty(root, nr); + + if (attr->ia_size > inode->i_size) { + ret = btrfs_cont_expand(inode, attr->ia_size); + if (ret) { + btrfs_truncate(inode); + return ret; + } + + i_size_write(inode, attr->ia_size); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + if (inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return 0; + } + + /* + * We're truncating a file that used to have good data down to + * zero. Make sure it gets into the ordered flush list so that + * any new writes get down to disk quickly. + */ + if (attr->ia_size == 0) + BTRFS_I(inode)->ordered_data_close = 1; + + /* we don't support swapfiles, so vmtruncate shouldn't fail */ + ret = vmtruncate(inode, attr->ia_size); + BUG_ON(ret); + + return 0; +} + static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; @@ -3250,23 +3331,14 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size > inode->i_size) { - err = btrfs_cont_expand(inode, attr->ia_size); - if (err) - return err; - } else if (inode->i_size > 0 && - attr->ia_size == 0) { - - /* we're truncating a file that used to have good - * data down to zero. Make sure it gets into - * the ordered flush list so that any new writes - * get down to disk quickly. - */ - BTRFS_I(inode)->ordered_data_close = 1; - } + err = btrfs_setattr_size(inode, attr); + if (err) + return err; } + attr->ia_valid &= ~ATTR_SIZE; - err = inode_setattr(inode, attr); + if (attr->ia_valid) + err = inode_setattr(inode, attr); if (!err && ((attr->ia_valid & ATTR_MODE))) err = btrfs_acl_chmod(inode); @@ -3287,36 +3359,43 @@ void btrfs_delete_inode(struct inode *inode) } btrfs_wait_ordered_range(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { + BUG_ON(!list_empty(&BTRFS_I(inode)->i_orphan)); + goto no_delete; + } + if (inode->i_nlink > 0) { BUG_ON(btrfs_root_refs(&root->root_item) != 0); goto no_delete; } btrfs_i_size_write(inode, 0); - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); - if (ret) { - btrfs_orphan_del(NULL, inode); - goto no_delete_lock; - } + while (1) { + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, 0, 0); - btrfs_orphan_del(trans, inode); + if (ret != -EAGAIN) + break; - nr = trans->blocks_used; - clear_inode(inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + trans = NULL; + btrfs_btree_balance_dirty(root, nr); + } - btrfs_end_transaction(trans, root); - btrfs_btree_balance_dirty(root, nr); - return; + if (ret == 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } -no_delete_lock: nr = trans->blocks_used; btrfs_end_transaction(trans, root); btrfs_btree_balance_dirty(root, nr); no_delete: clear_inode(inode); + return; } /* @@ -3569,7 +3648,6 @@ static noinline void init_btrfs_i(struct inode *inode) INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); - mutex_init(&BTRFS_I(inode)->extent_mutex); mutex_init(&BTRFS_I(inode)->log_mutex); } @@ -3695,6 +3773,13 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) } srcu_read_unlock(&root->fs_info->subvol_srcu, index); + if (root != sub_root) { + down_read(&root->fs_info->cleanup_work_sem); + if (!(inode->i_sb->s_flags & MS_RDONLY)) + btrfs_orphan_cleanup(sub_root); + up_read(&root->fs_info->cleanup_work_sem); + } + return inode; } @@ -3869,7 +3954,11 @@ skip: /* Reached end of directory/root. Bump pos past the last item. */ if (key_type == BTRFS_DIR_INDEX_KEY) - filp->f_pos = INT_LIMIT(off_t); + /* + * 32-bit glibc will use getdents64, but then strtol - + * so the last number we can serve is this. + */ + filp->f_pos = 0x7fffffff; else filp->f_pos++; nopos: @@ -4219,7 +4308,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4290,7 +4379,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -4336,6 +4425,10 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, if (inode->i_nlink == 0) return -ENOENT; + /* do not allow sys_link's with other subvols of the same device */ + if (root->objectid != BTRFS_I(inode)->root->objectid) + return -EPERM; + /* * 1 item for inode ref * 2 items for dir items @@ -4423,7 +4516,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) drop_on_err = 1; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) goto out_fail; @@ -5074,17 +5167,20 @@ static void btrfs_truncate(struct inode *inode) unsigned long nr; u64 mask = root->sectorsize - 1; - if (!S_ISREG(inode->i_mode)) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + if (!S_ISREG(inode->i_mode)) { + WARN_ON(1); return; + } ret = btrfs_truncate_page(inode->i_mapping, inode->i_size); if (ret) return; + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + btrfs_ordered_update_i_size(inode, inode->i_size, NULL); trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); /* * setattr is responsible for setting the ordered_data_close flag, @@ -5106,21 +5202,32 @@ static void btrfs_truncate(struct inode *inode) if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close) btrfs_add_ordered_operation(trans, root, inode); - btrfs_set_trans_block_group(trans, inode); - btrfs_i_size_write(inode, inode->i_size); + while (1) { + ret = btrfs_truncate_inode_items(trans, root, inode, + inode->i_size, + BTRFS_EXTENT_DATA_KEY); + if (ret != -EAGAIN) + break; - ret = btrfs_orphan_add(trans, inode); - if (ret) - goto out; - /* FIXME, add redo link to tree so we don't leak on crash */ - ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); - btrfs_update_inode(trans, root, inode); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); - ret = btrfs_orphan_del(trans, inode); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + } + + if (ret == 0 && inode->i_nlink > 0) { + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + } + + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); -out: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); BUG_ON(ret); @@ -5217,9 +5324,9 @@ void btrfs_destroy_inode(struct inode *inode) spin_lock(&root->list_lock); if (!list_empty(&BTRFS_I(inode)->i_orphan)) { - printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" - " list\n", inode->i_ino); - dump_stack(); + printk(KERN_INFO "BTRFS: inode %lu still on the orphan list\n", + inode->i_ino); + list_del_init(&BTRFS_I(inode)->i_orphan); } spin_unlock(&root->list_lock); @@ -5476,7 +5583,7 @@ out_fail: * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -int btrfs_start_delalloc_inodes(struct btrfs_root *root) +int btrfs_start_delalloc_inodes(struct btrfs_root *root, int delay_iput) { struct list_head *head = &root->fs_info->delalloc_inodes; struct btrfs_inode *binode; @@ -5495,7 +5602,10 @@ int btrfs_start_delalloc_inodes(struct btrfs_root *root) spin_unlock(&root->fs_info->delalloc_lock); if (inode) { filemap_flush(inode->i_mapping); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } cond_resched(); spin_lock(&root->fs_info->delalloc_lock); @@ -5569,7 +5679,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) goto out_unlock; - err = btrfs_init_inode_security(inode, dir); + err = btrfs_init_inode_security(trans, inode, dir); if (err) { drop_inode = 1; goto out_unlock; @@ -5641,57 +5751,77 @@ out_fail: return err; } -static int prealloc_file_range(struct btrfs_trans_handle *trans, - struct inode *inode, u64 start, u64 end, - u64 locked_end, u64 alloc_hint, int mode) +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode, loff_t actual_len) { + struct btrfs_trans_handle *trans; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_key ins; u64 alloc_size; u64 cur_offset = start; u64 num_bytes = end - start; int ret = 0; + u64 i_size; while (num_bytes > 0) { alloc_size = min(num_bytes, root->fs_info->max_extent); - ret = btrfs_reserve_metadata_space(root, 1); - if (ret) - goto out; + trans = btrfs_start_transaction(root, 1); ret = btrfs_reserve_extent(trans, root, alloc_size, root->sectorsize, 0, alloc_hint, (u64)-1, &ins, 1); if (ret) { WARN_ON(1); - goto out; + goto stop_trans; } + + ret = btrfs_reserve_metadata_space(root, 3); + if (ret) { + btrfs_free_reserved_extent(root, ins.objectid, + ins.offset); + goto stop_trans; + } + ret = insert_reserved_file_extent(trans, inode, cur_offset, ins.objectid, ins.offset, ins.offset, - ins.offset, locked_end, - 0, 0, 0, + ins.offset, 0, 0, 0, BTRFS_FILE_EXTENT_PREALLOC); BUG_ON(ret); btrfs_drop_extent_cache(inode, cur_offset, cur_offset + ins.offset -1, 0); + num_bytes -= ins.offset; cur_offset += ins.offset; alloc_hint = ins.objectid + ins.offset; - btrfs_unreserve_metadata_space(root, 1); - } -out: - if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; BTRFS_I(inode)->flags |= BTRFS_INODE_PREALLOC; if (!(mode & FALLOC_FL_KEEP_SIZE) && - cur_offset > i_size_read(inode)) - btrfs_i_size_write(inode, cur_offset); + (actual_len > inode->i_size) && + (cur_offset > inode->i_size)) { + + if (cur_offset > actual_len) + i_size = actual_len; + else + i_size = cur_offset; + i_size_write(inode, i_size); + btrfs_ordered_update_i_size(inode, i_size, NULL); + } + ret = btrfs_update_inode(trans, root, inode); BUG_ON(ret); + + btrfs_end_transaction(trans, root); + btrfs_unreserve_metadata_space(root, 3); } + return ret; +stop_trans: + btrfs_end_transaction(trans, root); return ret; + } static long btrfs_fallocate(struct inode *inode, int mode, @@ -5705,8 +5835,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, u64 locked_end; u64 mask = BTRFS_I(inode)->root->sectorsize - 1; struct extent_map *em; - struct btrfs_trans_handle *trans; - struct btrfs_root *root; int ret; alloc_start = offset & ~mask; @@ -5725,9 +5853,7 @@ static long btrfs_fallocate(struct inode *inode, int mode, goto out; } - root = BTRFS_I(inode)->root; - - ret = btrfs_check_data_free_space(root, inode, + ret = btrfs_check_data_free_space(BTRFS_I(inode)->root, inode, alloc_end - alloc_start); if (ret) goto out; @@ -5736,12 +5862,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, while (1) { struct btrfs_ordered_extent *ordered; - trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1); - if (!trans) { - ret = -EIO; - goto out_free; - } - /* the extent lock is ordered inside the running * transaction */ @@ -5755,8 +5875,6 @@ static long btrfs_fallocate(struct inode *inode, int mode, btrfs_put_ordered_extent(ordered); unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); - /* * we can't wait on the range with the transaction * running or with the extent lock held @@ -5777,10 +5895,12 @@ static long btrfs_fallocate(struct inode *inode, int mode, BUG_ON(IS_ERR(em) || !em); last_byte = min(extent_map_end(em), alloc_end); last_byte = (last_byte + mask) & ~mask; - if (em->block_start == EXTENT_MAP_HOLE) { - ret = prealloc_file_range(trans, inode, cur_offset, - last_byte, locked_end + 1, - alloc_hint, mode); + if (em->block_start == EXTENT_MAP_HOLE || + (cur_offset >= inode->i_size && + !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + ret = prealloc_file_range(inode, + cur_offset, last_byte, + alloc_hint, mode, offset+len); if (ret < 0) { free_extent_map(em); break; @@ -5799,9 +5919,8 @@ static long btrfs_fallocate(struct inode *inode, int mode, unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, GFP_NOFS); - btrfs_end_transaction(trans, BTRFS_I(inode)->root); -out_free: - btrfs_free_reserved_data_space(root, inode, alloc_end - alloc_start); + btrfs_free_reserved_data_space(BTRFS_I(inode)->root, inode, + alloc_end - alloc_start); out: mutex_unlock(&inode->i_mutex); return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index cdbb054102b..645a17927a8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -237,7 +237,6 @@ static noinline int create_subvol(struct btrfs_root *root, u64 objectid; u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; u64 index = 0; - unsigned long nr = 1; /* * 1 - inode item @@ -290,7 +289,7 @@ static noinline int create_subvol(struct btrfs_root *root, btrfs_set_root_generation(&root_item, trans->transid); btrfs_set_root_level(&root_item, 0); btrfs_set_root_refs(&root_item, 1); - btrfs_set_root_used(&root_item, 0); + btrfs_set_root_used(&root_item, leaf->len); btrfs_set_root_last_snapshot(&root_item, 0); memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); @@ -342,24 +341,21 @@ static noinline int create_subvol(struct btrfs_root *root, d_instantiate(dentry, btrfs_lookup_dentry(dir, dentry)); fail: - nr = trans->blocks_used; err = btrfs_commit_transaction(trans, root); if (err && !ret) ret = err; btrfs_unreserve_metadata_space(root, 6); - btrfs_btree_balance_dirty(root, nr); return ret; } static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, char *name, int namelen) { + struct inode *inode; struct btrfs_pending_snapshot *pending_snapshot; struct btrfs_trans_handle *trans; - int ret = 0; - int err; - unsigned long nr = 0; + int ret; if (!root->ref_cows) return -EINVAL; @@ -372,20 +368,20 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, */ ret = btrfs_reserve_metadata_space(root, 6); if (ret) - goto fail_unlock; + goto fail; pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); if (!pending_snapshot) { ret = -ENOMEM; btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); if (!pending_snapshot->name) { ret = -ENOMEM; kfree(pending_snapshot); btrfs_unreserve_metadata_space(root, 6); - goto fail_unlock; + goto fail; } memcpy(pending_snapshot->name, name, namelen); pending_snapshot->name[namelen] = '\0'; @@ -395,10 +391,19 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, pending_snapshot->root = root; list_add(&pending_snapshot->list, &trans->transaction->pending_snapshots); - err = btrfs_commit_transaction(trans, root); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + btrfs_unreserve_metadata_space(root, 6); -fail_unlock: - btrfs_btree_balance_dirty(root, nr); + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); + if (IS_ERR(inode)) { + ret = PTR_ERR(inode); + goto fail; + } + BUG_ON(!inode); + d_instantiate(dentry, inode); + ret = 0; +fail: return ret; } @@ -1027,8 +1032,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, BUG_ON(!trans); /* punch hole in destination first */ - btrfs_drop_extents(trans, root, inode, off, off + len, - off + len, 0, &hint_byte, 1); + btrfs_drop_extents(trans, inode, off, off + len, &hint_byte, 1); /* clone data */ key.objectid = src->i_ino; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5799bc46a30..5c2a9e78a94 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -291,16 +291,16 @@ int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) /* * remove an ordered extent from the tree. No references are dropped - * but, anyone waiting on this extent is woken up. + * and you must wake_up entry->wait. You must hold the tree mutex + * while you call this function. */ -int btrfs_remove_ordered_extent(struct inode *inode, +static int __btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; tree = &BTRFS_I(inode)->ordered_tree; - mutex_lock(&tree->mutex); node = &entry->rb_node; rb_erase(node, &tree->tree); tree->last = NULL; @@ -326,16 +326,34 @@ int btrfs_remove_ordered_extent(struct inode *inode, } spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but any waiters are woken. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + ret = __btrfs_remove_ordered_extent(inode, entry); mutex_unlock(&tree->mutex); wake_up(&entry->wait); - return 0; + + return ret; } /* * wait for all the ordered extents in a root. This is done when balancing * space between drives. */ -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput) { struct list_head splice; struct list_head *cur; @@ -372,7 +390,10 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) if (inode) { btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); - iput(inode); + if (delay_iput) + btrfs_add_delayed_iput(inode); + else + iput(inode); } else { btrfs_put_ordered_extent(ordered); } @@ -430,7 +451,7 @@ again: btrfs_wait_ordered_range(inode, 0, (u64)-1); else filemap_flush(inode->i_mapping); - iput(inode); + btrfs_add_delayed_iput(inode); } cond_resched(); @@ -589,7 +610,7 @@ out: * After an extent is done, call this to conditionally update the on disk * i_size. i_size is updated to cover any fully written part of the file. */ -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered) { struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; @@ -597,18 +618,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 disk_i_size; u64 new_i_size; u64 i_size_test; + u64 i_size = i_size_read(inode); struct rb_node *node; + struct rb_node *prev = NULL; struct btrfs_ordered_extent *test; + int ret = 1; + + if (ordered) + offset = entry_end(ordered); + else + offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize); mutex_lock(&tree->mutex); disk_i_size = BTRFS_I(inode)->disk_i_size; + /* truncate file */ + if (disk_i_size > i_size) { + BTRFS_I(inode)->disk_i_size = i_size; + ret = 0; + goto out; + } + /* * if the disk i_size is already at the inode->i_size, or * this ordered extent is inside the disk i_size, we're done */ - if (disk_i_size >= inode->i_size || - ordered->file_offset + ordered->len <= disk_i_size) { + if (disk_i_size == i_size || offset <= disk_i_size) { goto out; } @@ -616,8 +651,7 @@ int btrfs_ordered_update_i_size(struct inode *inode, * we can't update the disk_isize if there are delalloc bytes * between disk_i_size and this ordered extent */ - if (test_range_bit(io_tree, disk_i_size, - ordered->file_offset + ordered->len - 1, + if (test_range_bit(io_tree, disk_i_size, offset - 1, EXTENT_DELALLOC, 0, NULL)) { goto out; } @@ -626,20 +660,32 @@ int btrfs_ordered_update_i_size(struct inode *inode, * if we find an ordered extent then we can't update disk i_size * yet */ - node = &ordered->rb_node; - while (1) { - node = rb_prev(node); - if (!node) - break; + if (ordered) { + node = rb_prev(&ordered->rb_node); + } else { + prev = tree_search(tree, offset); + /* + * we insert file extents without involving ordered struct, + * so there should be no ordered struct cover this offset + */ + if (prev) { + test = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + BUG_ON(offset_in_entry(test, offset)); + } + node = prev; + } + while (node) { test = rb_entry(node, struct btrfs_ordered_extent, rb_node); if (test->file_offset + test->len <= disk_i_size) break; - if (test->file_offset >= inode->i_size) + if (test->file_offset >= i_size) break; if (test->file_offset >= disk_i_size) goto out; + node = rb_prev(node); } - new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + new_i_size = min_t(u64, offset, i_size); /* * at this point, we know we can safely update i_size to at least @@ -647,7 +693,14 @@ int btrfs_ordered_update_i_size(struct inode *inode, * walk forward and see if ios from higher up in the file have * finished. */ - node = rb_next(&ordered->rb_node); + if (ordered) { + node = rb_next(&ordered->rb_node); + } else { + if (prev) + node = rb_next(prev); + else + node = rb_first(&tree->tree); + } i_size_test = 0; if (node) { /* @@ -655,10 +708,10 @@ int btrfs_ordered_update_i_size(struct inode *inode, * between our ordered extent and the next one. */ test = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (test->file_offset > entry_end(ordered)) + if (test->file_offset > offset) i_size_test = test->file_offset; } else { - i_size_test = i_size_read(inode); + i_size_test = i_size; } /* @@ -667,15 +720,25 @@ int btrfs_ordered_update_i_size(struct inode *inode, * are no delalloc bytes in this area, it is safe to update * disk_i_size to the end of the region. */ - if (i_size_test > entry_end(ordered) && - !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, - EXTENT_DELALLOC, 0, NULL)) { - new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + if (i_size_test > offset && + !test_range_bit(io_tree, offset, i_size_test - 1, + EXTENT_DELALLOC, 0, NULL)) { + new_i_size = min_t(u64, i_size_test, i_size); } BTRFS_I(inode)->disk_i_size = new_i_size; + ret = 0; out: + /* + * we need to remove the ordered extent with the tree lock held + * so that other people calling this function don't find our fully + * processed ordered entry and skip updating the i_size + */ + if (ordered) + __btrfs_remove_ordered_extent(inode, ordered); mutex_unlock(&tree->mutex); - return 0; + if (ordered) + wake_up(&ordered->wait); + return ret; } /* diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index f82e87488ca..1fe1282ef47 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -150,12 +150,13 @@ void btrfs_start_ordered_extent(struct inode *inode, int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); -int btrfs_ordered_update_i_size(struct inode *inode, +int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); -int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); int btrfs_run_ordered_operations(struct btrfs_root *root, int wait); int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, + int nocow_only, int delay_iput); #endif diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index cfcc93c93a7..ab7ab531874 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1561,6 +1561,20 @@ static int invalidate_extent_cache(struct btrfs_root *root, return 0; } +static void put_inodes(struct list_head *list) +{ + struct inodevec *ivec; + while (!list_empty(list)) { + ivec = list_entry(list->next, struct inodevec, list); + list_del(&ivec->list); + while (ivec->nr > 0) { + ivec->nr--; + iput(ivec->inode[ivec->nr]); + } + kfree(ivec); + } +} + static int find_next_key(struct btrfs_path *path, int level, struct btrfs_key *key) @@ -1723,6 +1737,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, btrfs_btree_balance_dirty(root, nr); + /* + * put inodes outside transaction, otherwise we may deadlock. + */ + put_inodes(&inode_list); + if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); } @@ -1752,19 +1771,7 @@ out: btrfs_btree_balance_dirty(root, nr); - /* - * put inodes while we aren't holding the tree locks - */ - while (!list_empty(&inode_list)) { - struct inodevec *ivec; - ivec = list_entry(inode_list.next, struct inodevec, list); - list_del(&ivec->list); - while (ivec->nr > 0) { - ivec->nr--; - iput(ivec->inode[ivec->nr]); - } - kfree(ivec); - } + put_inodes(&inode_list); if (replaced && rc->stage == UPDATE_DATA_PTRS) invalidate_extent_cache(root, &key, &next_key); @@ -3274,8 +3281,10 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) return -ENOMEM; path = btrfs_alloc_path(); - if (!path) + if (!path) { + kfree(cluster); return -ENOMEM; + } rc->extents_found = 0; rc->extents_skipped = 0; @@ -3534,8 +3543,8 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start) (unsigned long long)rc->block_group->key.objectid, (unsigned long long)rc->block_group->flags); - btrfs_start_delalloc_inodes(fs_info->tree_root); - btrfs_wait_ordered_extents(fs_info->tree_root, 0); + btrfs_start_delalloc_inodes(fs_info->tree_root, 0); + btrfs_wait_ordered_extents(fs_info->tree_root, 0, 0); while (1) { rc->extents_found = 0; @@ -3755,6 +3764,8 @@ out: BTRFS_DATA_RELOC_TREE_OBJECTID); if (IS_ERR(fs_root)) err = PTR_ERR(fs_root); + else + btrfs_orphan_cleanup(fs_root); } return err; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 752a5463bf5..8a1ea6e6457 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -66,7 +66,8 @@ enum { Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, Opt_ssd, Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, - Opt_compress, Opt_notreelog, Opt_ratio, Opt_flushoncommit, + Opt_compress, Opt_compress_force, Opt_notreelog, Opt_ratio, + Opt_flushoncommit, Opt_discard, Opt_err, }; @@ -82,6 +83,7 @@ static match_table_t tokens = { {Opt_alloc_start, "alloc_start=%s"}, {Opt_thread_pool, "thread_pool=%d"}, {Opt_compress, "compress"}, + {Opt_compress_force, "compress-force"}, {Opt_ssd, "ssd"}, {Opt_ssd_spread, "ssd_spread"}, {Opt_nossd, "nossd"}, @@ -128,6 +130,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *num; int intarg; + int ret = 0; if (!options) return 0; @@ -172,6 +175,11 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) printk(KERN_INFO "btrfs: use compression\n"); btrfs_set_opt(info->mount_opt, COMPRESS); break; + case Opt_compress_force: + printk(KERN_INFO "btrfs: forcing compression\n"); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; case Opt_ssd: printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); btrfs_set_opt(info->mount_opt, SSD); @@ -262,12 +270,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) case Opt_discard: btrfs_set_opt(info->mount_opt, DISCARD); break; + case Opt_err: + printk(KERN_INFO "btrfs: unrecognized mount option " + "'%s'\n", p); + ret = -EINVAL; + goto out; default: break; } } +out: kfree(options); - return 0; + return ret; } /* @@ -405,8 +419,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait) return 0; } - btrfs_start_delalloc_inodes(root); - btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 0); + btrfs_wait_ordered_extents(root, 0, 0); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); @@ -450,6 +464,8 @@ static int btrfs_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",notreelog"); if (btrfs_test_opt(root, FLUSHONCOMMIT)) seq_puts(seq, ",flushoncommit"); + if (btrfs_test_opt(root, DISCARD)) + seq_puts(seq, ",discard"); if (!(root->fs_info->sb->s_flags & MS_POSIXACL)) seq_puts(seq, ",noacl"); return 0; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index c207e8c32c9..b2acc79f1b3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -333,6 +333,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, memset(trans, 0, sizeof(*trans)); kmem_cache_free(btrfs_trans_handle_cachep, trans); + if (throttle) + btrfs_run_delayed_iputs(root); + return 0; } @@ -354,7 +357,7 @@ int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, * those extents are sent to disk but does not wait on them */ int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -367,7 +370,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, while (1) { ret = find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_DIRTY); + mark); if (ret) break; while (start <= end) { @@ -413,7 +416,7 @@ int btrfs_write_marked_extents(struct btrfs_root *root, * on all the pages and clear them from the dirty pages state tree */ int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int err = 0; @@ -425,12 +428,12 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, unsigned long index; while (1) { - ret = find_first_extent_bit(dirty_pages, 0, &start, &end, - EXTENT_DIRTY); + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + mark); if (ret) break; - clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { index = start >> PAGE_CACHE_SHIFT; start = (u64)(index + 1) << PAGE_CACHE_SHIFT; @@ -460,13 +463,13 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, * those extents are on disk for transaction or log commit */ int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages) + struct extent_io_tree *dirty_pages, int mark) { int ret; int ret2; - ret = btrfs_write_marked_extents(root, dirty_pages); - ret2 = btrfs_wait_marked_extents(root, dirty_pages); + ret = btrfs_write_marked_extents(root, dirty_pages, mark); + ret2 = btrfs_wait_marked_extents(root, dirty_pages, mark); return ret || ret2; } @@ -479,7 +482,8 @@ int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, return filemap_write_and_wait(btree_inode->i_mapping); } return btrfs_write_and_wait_marked_extents(root, - &trans->transaction->dirty_pages); + &trans->transaction->dirty_pages, + EXTENT_DIRTY); } /* @@ -497,13 +501,16 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, { int ret; u64 old_root_bytenr; + u64 old_root_used; struct btrfs_root *tree_root = root->fs_info->tree_root; + old_root_used = btrfs_root_used(&root->root_item); btrfs_write_dirty_block_groups(trans, root); while (1) { old_root_bytenr = btrfs_root_bytenr(&root->root_item); - if (old_root_bytenr == root->node->start) + if (old_root_bytenr == root->node->start && + old_root_used == btrfs_root_used(&root->root_item)) break; btrfs_set_root_node(&root->root_item, root->node); @@ -512,6 +519,7 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans, &root->root_item); BUG_ON(ret); + old_root_used = btrfs_root_used(&root->root_item); ret = btrfs_write_dirty_block_groups(trans, root); BUG_ON(ret); } @@ -795,7 +803,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, memcpy(&pending->root_key, &key, sizeof(key)); fail: kfree(new_root_item); - btrfs_unreserve_metadata_space(root, 6); return ret; } @@ -807,7 +814,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, u64 index = 0; struct btrfs_trans_handle *trans; struct inode *parent_inode; - struct inode *inode; struct btrfs_root *parent_root; parent_inode = pending->dentry->d_parent->d_inode; @@ -839,8 +845,6 @@ static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, BUG_ON(ret); - inode = btrfs_lookup_dentry(parent_inode, pending->dentry); - d_instantiate(pending->dentry, inode); fail: btrfs_end_transaction(trans, fs_info->fs_root); return ret; @@ -994,11 +998,11 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, mutex_unlock(&root->fs_info->trans_mutex); if (flush_on_commit) { - btrfs_start_delalloc_inodes(root); - ret = btrfs_wait_ordered_extents(root, 0); + btrfs_start_delalloc_inodes(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } else if (snap_pending) { - ret = btrfs_wait_ordered_extents(root, 1); + ret = btrfs_wait_ordered_extents(root, 0, 1); BUG_ON(ret); } @@ -1116,6 +1120,10 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, current->journal_info = NULL; kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (current != root->fs_info->transaction_kthread) + btrfs_run_delayed_iputs(root); + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d4e3e7a6938..93c7ccb3311 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -107,10 +107,10 @@ void btrfs_throttle(struct btrfs_root *root); int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_write_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_wait_marked_extents(struct btrfs_root *root, - struct extent_io_tree *dirty_pages); + struct extent_io_tree *dirty_pages, int mark); int btrfs_transaction_in_commit(struct btrfs_fs_info *info); #endif diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 741666a7676..4a9434b622e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -542,8 +542,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, saved_nbytes = inode_get_bytes(inode); /* drop any overlapping extents */ - ret = btrfs_drop_extents(trans, root, inode, - start, extent_end, extent_end, start, &alloc_hint, 1); + ret = btrfs_drop_extents(trans, inode, start, extent_end, + &alloc_hint, 1); BUG_ON(ret); if (found_type == BTRFS_FILE_EXTENT_REG || @@ -930,6 +930,17 @@ out_nowrite: return 0; } +static int insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + int ret; + ret = btrfs_find_orphan_item(root, offset); + if (ret > 0) + ret = btrfs_insert_orphan_item(trans, root, offset); + return ret; +} + + /* * There are a few corners where the link count of the file can't * be properly maintained during replay. So, instead of adding @@ -997,9 +1008,13 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, } BTRFS_I(inode)->index_cnt = (u64)-1; - if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) { - ret = replay_dir_deletes(trans, root, NULL, path, - inode->i_ino, 1); + if (inode->i_nlink == 0) { + if (S_ISDIR(inode->i_mode)) { + ret = replay_dir_deletes(trans, root, NULL, path, + inode->i_ino, 1); + BUG_ON(ret); + } + ret = insert_orphan_item(trans, root, inode->i_ino); BUG_ON(ret); } btrfs_free_path(path); @@ -1587,7 +1602,6 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, /* inode keys are done during the first stage */ if (key.type == BTRFS_INODE_ITEM_KEY && wc->stage == LOG_WALK_REPLAY_INODES) { - struct inode *inode; struct btrfs_inode_item *inode_item; u32 mode; @@ -1603,31 +1617,16 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, eb, i, &key); BUG_ON(ret); - /* for regular files, truncate away - * extents past the new EOF + /* for regular files, make sure corresponding + * orhpan item exist. extents past the new EOF + * will be truncated later by orphan cleanup. */ if (S_ISREG(mode)) { - inode = read_one_inode(root, - key.objectid); - BUG_ON(!inode); - - ret = btrfs_truncate_inode_items(wc->trans, - root, inode, inode->i_size, - BTRFS_EXTENT_DATA_KEY); + ret = insert_orphan_item(wc->trans, root, + key.objectid); BUG_ON(ret); - - /* if the nlink count is zero here, the iput - * will free the inode. We bump it to make - * sure it doesn't get freed until the link - * count fixup is done - */ - if (inode->i_nlink == 0) { - btrfs_inc_nlink(inode); - btrfs_update_inode(wc->trans, - root, inode); - } - iput(inode); } + ret = link_to_fixup_dir(wc->trans, root, path, key.objectid); BUG_ON(ret); @@ -1977,10 +1976,11 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, { int index1; int index2; + int mark; int ret; struct btrfs_root *log = root->log_root; struct btrfs_root *log_root_tree = root->fs_info->log_root_tree; - u64 log_transid = 0; + unsigned long log_transid = 0; mutex_lock(&root->log_mutex); index1 = root->log_transid % 2; @@ -2014,24 +2014,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, goto out; } + log_transid = root->log_transid; + if (log_transid % 2 == 0) + mark = EXTENT_DIRTY; + else + mark = EXTENT_NEW; + /* we start IO on all the marked extents here, but we don't actually * wait for them until later. */ - ret = btrfs_write_marked_extents(log, &log->dirty_log_pages); + ret = btrfs_write_marked_extents(log, &log->dirty_log_pages, mark); BUG_ON(ret); btrfs_set_root_node(&log->root_item, log->node); root->log_batch = 0; - log_transid = root->log_transid; root->log_transid++; log->log_transid = root->log_transid; root->log_start_pid = 0; smp_mb(); /* - * log tree has been flushed to disk, new modifications of - * the log will be written to new positions. so it's safe to - * allow log writers to go in. + * IO has been started, blocks of the log tree have WRITTEN flag set + * in their headers. new modifications of the log will be written to + * new positions. so it's safe to allow log writers to go in. */ mutex_unlock(&root->log_mutex); @@ -2052,7 +2057,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, index2 = log_root_tree->log_transid % 2; if (atomic_read(&log_root_tree->log_commit[index2])) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); wait_log_commit(trans, log_root_tree, log_root_tree->log_transid); mutex_unlock(&log_root_tree->log_mutex); @@ -2072,16 +2077,17 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, * check the full commit flag again */ if (root->fs_info->last_trans_log_full_commit == trans->transid) { - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); mutex_unlock(&log_root_tree->log_mutex); ret = -EAGAIN; goto out_wake_log_root; } ret = btrfs_write_and_wait_marked_extents(log_root_tree, - &log_root_tree->dirty_log_pages); + &log_root_tree->dirty_log_pages, + EXTENT_DIRTY | EXTENT_NEW); BUG_ON(ret); - btrfs_wait_marked_extents(log, &log->dirty_log_pages); + btrfs_wait_marked_extents(log, &log->dirty_log_pages, mark); btrfs_set_super_log_root(&root->fs_info->super_for_commit, log_root_tree->node->start); @@ -2147,12 +2153,12 @@ int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) while (1) { ret = find_first_extent_bit(&log->dirty_log_pages, - 0, &start, &end, EXTENT_DIRTY); + 0, &start, &end, EXTENT_DIRTY | EXTENT_NEW); if (ret) break; - clear_extent_dirty(&log->dirty_log_pages, - start, end, GFP_NOFS); + clear_extent_bits(&log->dirty_log_pages, start, end, + EXTENT_DIRTY | EXTENT_NEW, GFP_NOFS); } if (log->log_transid > 0) { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7eda483d7b5..41ecbb2347f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1135,7 +1135,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) root->fs_info->avail_metadata_alloc_bits; if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && - root->fs_info->fs_devices->rw_devices <= 4) { + root->fs_info->fs_devices->num_devices <= 4) { printk(KERN_ERR "btrfs: unable to go below four devices " "on raid10\n"); ret = -EINVAL; @@ -1143,7 +1143,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) } if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && - root->fs_info->fs_devices->rw_devices <= 2) { + root->fs_info->fs_devices->num_devices <= 2) { printk(KERN_ERR "btrfs: unable to go below two " "devices on raid1\n"); ret = -EINVAL; @@ -1434,8 +1434,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return -EINVAL; bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); - if (!bdev) - return -EIO; + if (IS_ERR(bdev)) + return PTR_ERR(bdev); if (root->fs_info->fs_devices->seeding) { seeding_dev = 1; @@ -2209,7 +2209,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, max_chunk_size = 10 * calc_size; min_stripe_size = 64 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_METADATA) { - max_chunk_size = 4 * calc_size; + max_chunk_size = 256 * 1024 * 1024; min_stripe_size = 32 * 1024 * 1024; } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { calc_size = 8 * 1024 * 1024; @@ -2538,6 +2538,11 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; + if (btrfs_test_opt(root, DEGRADED)) { + free_extent_map(em); + return 0; + } + map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { if (!map->stripes[i].dev->writeable) { @@ -2649,8 +2654,10 @@ again: em = lookup_extent_mapping(em_tree, logical, *length); read_unlock(&em_tree->lock); - if (!em && unplug_page) + if (!em && unplug_page) { + kfree(multi); return 0; + } if (!em) { printk(KERN_CRIT "unable to find logical %llu len %llu\n", diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index b6dd5967c48..193b58f7d3f 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -85,22 +85,23 @@ out: return ret; } -int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int do_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) { struct btrfs_dir_item *di; struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_trans_handle *trans; struct btrfs_path *path; - int ret = 0, mod = 0; + size_t name_len = strlen(name); + int ret = 0; + + if (name_len + size > BTRFS_MAX_XATTR_SIZE(root)) + return -ENOSPC; path = btrfs_alloc_path(); if (!path) return -ENOMEM; - trans = btrfs_join_transaction(root, 1); - btrfs_set_trans_block_group(trans, inode); - /* first lets see if we already have this xattr */ di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, strlen(name), -1); @@ -118,15 +119,12 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } ret = btrfs_delete_one_dir_name(trans, root, path, di); - if (ret) - goto out; + BUG_ON(ret); btrfs_release_path(root, path); /* if we don't have a value then we are removing the xattr */ - if (!value) { - mod = 1; + if (!value) goto out; - } } else { btrfs_release_path(root, path); @@ -138,20 +136,45 @@ int __btrfs_setxattr(struct inode *inode, const char *name, } /* ok we have to create a completely new xattr */ - ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), - value, size, inode->i_ino); + ret = btrfs_insert_xattr_item(trans, root, path, inode->i_ino, + name, name_len, value, size); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + if (trans) + return do_setxattr(trans, inode, name, value, size, flags); + + ret = btrfs_reserve_metadata_space(root, 2); if (ret) - goto out; - mod = 1; + return ret; -out: - if (mod) { - inode->i_ctime = CURRENT_TIME; - ret = btrfs_update_inode(trans, root, inode); + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; } + btrfs_set_trans_block_group(trans, inode); - btrfs_end_transaction(trans, root); - btrfs_free_path(path); + ret = do_setxattr(trans, inode, name, value, size, flags); + if (ret) + goto out; + + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); +out: + btrfs_end_transaction_throttle(trans, root); + btrfs_unreserve_metadata_space(root, 2); return ret; } @@ -314,7 +337,9 @@ int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (size == 0) value = ""; /* empty EA, do not remove */ - return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, value, size, + flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) @@ -329,10 +354,13 @@ int btrfs_removexattr(struct dentry *dentry, const char *name) if (!btrfs_is_valid_xattr(name)) return -EOPNOTSUPP; - return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); + + return __btrfs_setxattr(NULL, dentry->d_inode, name, NULL, 0, + XATTR_REPLACE); } -int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) +int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir) { int err; size_t len; @@ -354,7 +382,7 @@ int btrfs_xattr_security_init(struct inode *inode, struct inode *dir) } else { strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, suffix); - err = __btrfs_setxattr(inode, name, value, len, 0); + err = __btrfs_setxattr(trans, inode, name, value, len, 0); kfree(name); } diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index c71e9c3cf3f..721efa0346e 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -27,15 +27,16 @@ extern struct xattr_handler *btrfs_xattr_handlers[]; extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, void *buffer, size_t size); -extern int __btrfs_setxattr(struct inode *inode, const char *name, - const void *value, size_t size, int flags); - +extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, + struct inode *inode, const char *name, + const void *value, size_t size, int flags); extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); -extern int btrfs_xattr_security_init(struct inode *inode, struct inode *dir); +extern int btrfs_xattr_security_init(struct btrfs_trans_handle *trans, + struct inode *inode, struct inode *dir); #endif /* __XATTR__ */ diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 094ea65afc8..7b2600b380d 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -5,7 +5,9 @@ have duplicated data). Fix oops in cifs_lookup. Workaround problem mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session. Disable use of server inode numbers when server only partially supports them (e.g. for one server querying inode numbers on -FindFirst fails but QPathInfo queries works). +FindFirst fails but QPathInfo queries works). Fix oops with dfs in +cifs_put_smb_ses. Fix mmap to work on directio mounts (needed +for OpenOffice when on forcedirectio mount e.g.) Version 1.60 ------------- diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index fea9e898c4b..b44ce0a0711 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -269,7 +269,7 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, int err; mntget(newmnt); - err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags, mntlist); + err = do_add_mount(newmnt, &nd->path, nd->path.mnt->mnt_flags | MNT_SHRINKABLE, mntlist); switch (err) { case 0: path_put(&nd->path); @@ -371,7 +371,6 @@ cifs_dfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) if (IS_ERR(mnt)) goto out_err; - nd->path.mnt->mnt_flags |= MNT_SHRINKABLE; rc = add_mount_helper(mnt, nd, &cifs_dfs_automount_list); out: diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 29f1da761bb..8c6a0362717 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -758,7 +758,7 @@ const struct file_operations cifs_file_ops = { }; const struct file_operations cifs_file_direct_ops = { - /* no mmap, no aio, no readv - + /* no aio, no readv - BB reevaluate whether they can be done with directio, no cache */ .read = cifs_user_read, .write = cifs_user_write, @@ -767,6 +767,7 @@ const struct file_operations cifs_file_direct_ops = { .lock = cifs_lock, .fsync = cifs_fsync, .flush = cifs_flush, + .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, #ifdef CONFIG_CIFS_POSIX .unlocked_ioctl = cifs_ioctl, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 63ea83ff687..3bbcaa716b3 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2287,12 +2287,12 @@ int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data_global, const char *devname) { - int rc = 0; + int rc; int xid; struct smb_vol *volume_info; - struct cifsSesInfo *pSesInfo = NULL; - struct cifsTconInfo *tcon = NULL; - struct TCP_Server_Info *srvTcp = NULL; + struct cifsSesInfo *pSesInfo; + struct cifsTconInfo *tcon; + struct TCP_Server_Info *srvTcp; char *full_path; char *mount_data = mount_data_global; #ifdef CONFIG_CIFS_DFS_UPCALL @@ -2301,6 +2301,10 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, int referral_walks_count = 0; try_mount_again: #endif + rc = 0; + tcon = NULL; + pSesInfo = NULL; + srvTcp = NULL; full_path = NULL; xid = GetXid(); @@ -2597,6 +2601,7 @@ remote_path_check: cleanup_volume_info(&volume_info); referral_walks_count++; + FreeXid(xid); goto try_mount_again; } #else /* No DFS support, return error on mount */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 14cbc831422..c5c45de1a2e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1005,6 +1005,9 @@ COMPATIBLE_IOCTL(SCSI_IOCTL_SEND_COMMAND) COMPATIBLE_IOCTL(SCSI_IOCTL_PROBE_HOST) COMPATIBLE_IOCTL(SCSI_IOCTL_GET_PCI) #endif +/* Big V (don't complain on serial console) */ +IGNORE_IOCTL(VT_OPENQRY) +IGNORE_IOCTL(VT_GETMODE) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) @@ -1600,8 +1603,6 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* SG stuff */ - case SG_SET_TRANSFORM: /* AUTOFS */ case AUTOFS_IOC_READY: case AUTOFS_IOC_FAIL: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index c8afa6b1d91..32a5f46b115 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -121,8 +121,10 @@ static int get_target(const char *symname, struct path *path, ret = -ENOENT; path_put(path); } - } else + } else { ret = -EPERM; + path_put(path); + } } return ret; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b486169f42b..274ac865bae 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -160,15 +160,8 @@ static int debugfs_create_by_name(const char *name, mode_t mode, * block. A pointer to that is in the struct vfsmount that we * have around. */ - if (!parent) { - if (debugfs_mount && debugfs_mount->mnt_sb) { - parent = debugfs_mount->mnt_sb->s_root; - } - } - if (!parent) { - pr_debug("debugfs: Ah! can not find a parent!\n"); - return -EFAULT; - } + if (!parent) + parent = debugfs_mount->mnt_sb->s_root; *dentry = NULL; mutex_lock(&parent->d_inode->i_mutex); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index fbb6e5eed69..7cb0a59f4b9 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1748,7 +1748,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, char *cipher_name, size_t *key_size) { char dummy_key[ECRYPTFS_MAX_KEY_BYTES]; - char *full_alg_name; + char *full_alg_name = NULL; int rc; *key_tfm = NULL; @@ -1763,7 +1763,6 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, if (rc) goto out; *key_tfm = crypto_alloc_blkcipher(full_alg_name, 0, CRYPTO_ALG_ASYNC); - kfree(full_alg_name); if (IS_ERR(*key_tfm)) { rc = PTR_ERR(*key_tfm); printk(KERN_ERR "Unable to allocate crypto cipher with name " @@ -1786,6 +1785,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, goto out; } out: + kfree(full_alg_name); return rc; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9e944057001..678172b61be 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -158,7 +158,7 @@ static int ecryptfs_open(struct inode *inode, struct file *file) struct dentry *ecryptfs_dentry = file->f_path.dentry; /* Private value of ecryptfs_dentry allocated in * ecryptfs_lookup() */ - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); + struct dentry *lower_dentry; struct ecryptfs_file_info *file_info; mount_crypt_stat = &ecryptfs_superblock_to_private( @@ -191,13 +191,6 @@ static int ecryptfs_open(struct inode *inode, struct file *file) | ECRYPTFS_ENCRYPTED); } mutex_unlock(&crypt_stat->cs_mutex); - if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) - && !(file->f_flags & O_RDONLY)) { - rc = -EPERM; - printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " - "file must hence be opened RO\n", __func__); - goto out; - } if (!ecryptfs_inode_to_private(inode)->lower_file) { rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { @@ -208,6 +201,13 @@ static int ecryptfs_open(struct inode *inode, struct file *file) goto out; } } + if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_RDONLY) + && !(file->f_flags & O_RDONLY)) { + rc = -EPERM; + printk(KERN_WARNING "%s: Lower persistent file is RO; eCryptfs " + "file must hence be opened RO\n", __func__); + goto out; + } ecryptfs_set_file_lower( file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { @@ -299,7 +299,6 @@ static int ecryptfs_ioctl(struct inode *inode, struct file *file, const struct file_operations ecryptfs_dir_fops = { .readdir = ecryptfs_readdir, .ioctl = ecryptfs_ioctl, - .mmap = generic_file_mmap, .open = ecryptfs_open, .flush = ecryptfs_flush, .release = ecryptfs_release, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 429ca0b3ba0..4a430ab4115 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -282,7 +282,8 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, - ecryptfs_dir_inode->i_sb, 1); + ecryptfs_dir_inode->i_sb, + ECRYPTFS_INTERPOSE_FLAG_D_ADD); if (rc) { printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", __func__, rc); @@ -463,9 +464,6 @@ out_lock: unlock_dir(lower_dir_dentry); dput(lower_new_dentry); dput(lower_old_dentry); - d_drop(lower_old_dentry); - d_drop(new_dentry); - d_drop(old_dentry); return rc; } @@ -614,6 +612,7 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct dentry *lower_new_dentry; struct dentry *lower_old_dir_dentry; struct dentry *lower_new_dir_dentry; + struct dentry *trap = NULL; lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); @@ -621,7 +620,17 @@ ecryptfs_rename(struct inode *old_dir, struct dentry *old_dentry, dget(lower_new_dentry); lower_old_dir_dentry = dget_parent(lower_old_dentry); lower_new_dir_dentry = dget_parent(lower_new_dentry); - lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry); + /* source should not be ancestor of target */ + if (trap == lower_old_dentry) { + rc = -EINVAL; + goto out_lock; + } + /* target should not be ancestor of source */ + if (trap == lower_new_dentry) { + rc = -ENOTEMPTY; + goto out_lock; + } rc = vfs_rename(lower_old_dir_dentry->d_inode, lower_old_dentry, lower_new_dir_dentry->d_inode, lower_new_dentry); if (rc) @@ -715,31 +724,31 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) /* Released in ecryptfs_put_link(); only release here on error */ buf = kmalloc(len, GFP_KERNEL); if (!buf) { - rc = -ENOMEM; + buf = ERR_PTR(-ENOMEM); goto out; } old_fs = get_fs(); set_fs(get_ds()); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); set_fs(old_fs); - if (rc < 0) - goto out_free; - else + if (rc < 0) { + kfree(buf); + buf = ERR_PTR(rc); + } else buf[rc] = '\0'; - rc = 0; - nd_set_link(nd, buf); - goto out; -out_free: - kfree(buf); out: - return ERR_PTR(rc); + nd_set_link(nd, buf); + return NULL; } static void ecryptfs_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { - /* Free the char* */ - kfree(nd_get_link(nd)); + char *buf = nd_get_link(nd); + if (!IS_ERR(buf)) { + /* Free the char* */ + kfree(buf); + } } /** @@ -772,18 +781,23 @@ upper_size_to_lower_size(struct ecryptfs_crypt_stat *crypt_stat, } /** - * ecryptfs_truncate + * truncate_upper * @dentry: The ecryptfs layer dentry - * @new_length: The length to expand the file to + * @ia: Address of the ecryptfs inode's attributes + * @lower_ia: Address of the lower inode's attributes * * Function to handle truncations modifying the size of the file. Note * that the file sizes are interpolated. When expanding, we are simply - * writing strings of 0's out. When truncating, we need to modify the - * underlying file size according to the page index interpolations. + * writing strings of 0's out. When truncating, we truncate the upper + * inode and update the lower_ia according to the page index + * interpolations. If ATTR_SIZE is set in lower_ia->ia_valid upon return, + * the caller must use lower_ia in a call to notify_change() to perform + * the truncation of the lower inode. * * Returns zero on success; non-zero otherwise */ -int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +static int truncate_upper(struct dentry *dentry, struct iattr *ia, + struct iattr *lower_ia) { int rc = 0; struct inode *inode = dentry->d_inode; @@ -794,8 +808,10 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) loff_t lower_size_before_truncate; loff_t lower_size_after_truncate; - if (unlikely((new_length == i_size))) + if (unlikely((ia->ia_size == i_size))) { + lower_ia->ia_valid &= ~ATTR_SIZE; goto out; + } crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Set up a fake ecryptfs file, this is used to interface with * the file in the underlying filesystem so that the @@ -815,28 +831,30 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) &fake_ecryptfs_file, ecryptfs_inode_to_private(dentry->d_inode)->lower_file); /* Switch on growing or shrinking file */ - if (new_length > i_size) { + if (ia->ia_size > i_size) { char zero[] = { 0x00 }; + lower_ia->ia_valid &= ~ATTR_SIZE; /* Write a single 0 at the last position of the file; * this triggers code that will fill in 0's throughout * the intermediate portion of the previous end of the * file and the new and of the file */ rc = ecryptfs_write(&fake_ecryptfs_file, zero, - (new_length - 1), 1); - } else { /* new_length < i_size_read(inode) */ - /* We're chopping off all the pages down do the page - * in which new_length is located. Fill in the end of - * that page from (new_length & ~PAGE_CACHE_MASK) to + (ia->ia_size - 1), 1); + } else { /* ia->ia_size < i_size_read(inode) */ + /* We're chopping off all the pages down to the page + * in which ia->ia_size is located. Fill in the end of + * that page from (ia->ia_size & ~PAGE_CACHE_MASK) to * PAGE_CACHE_SIZE with zeros. */ size_t num_zeros = (PAGE_CACHE_SIZE - - (new_length & ~PAGE_CACHE_MASK)); + - (ia->ia_size & ~PAGE_CACHE_MASK)); if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED)) { - rc = vmtruncate(inode, new_length); + rc = vmtruncate(inode, ia->ia_size); if (rc) goto out_free; - rc = vmtruncate(lower_dentry->d_inode, new_length); + lower_ia->ia_size = ia->ia_size; + lower_ia->ia_valid |= ATTR_SIZE; goto out_free; } if (num_zeros) { @@ -848,7 +866,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } rc = ecryptfs_write(&fake_ecryptfs_file, zeros_virt, - new_length, num_zeros); + ia->ia_size, num_zeros); kfree(zeros_virt); if (rc) { printk(KERN_ERR "Error attempting to zero out " @@ -857,7 +875,7 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) goto out_free; } } - vmtruncate(inode, new_length); + vmtruncate(inode, ia->ia_size); rc = ecryptfs_write_inode_size_to_metadata(inode); if (rc) { printk(KERN_ERR "Problem with " @@ -870,10 +888,12 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) lower_size_before_truncate = upper_size_to_lower_size(crypt_stat, i_size); lower_size_after_truncate = - upper_size_to_lower_size(crypt_stat, new_length); - if (lower_size_after_truncate < lower_size_before_truncate) - vmtruncate(lower_dentry->d_inode, - lower_size_after_truncate); + upper_size_to_lower_size(crypt_stat, ia->ia_size); + if (lower_size_after_truncate < lower_size_before_truncate) { + lower_ia->ia_size = lower_size_after_truncate; + lower_ia->ia_valid |= ATTR_SIZE; + } else + lower_ia->ia_valid &= ~ATTR_SIZE; } out_free: if (ecryptfs_file_to_private(&fake_ecryptfs_file)) @@ -883,6 +903,33 @@ out: return rc; } +/** + * ecryptfs_truncate + * @dentry: The ecryptfs layer dentry + * @new_length: The length to expand the file to + * + * Simple function that handles the truncation of an eCryptfs inode and + * its corresponding lower inode. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) +{ + struct iattr ia = { .ia_valid = ATTR_SIZE, .ia_size = new_length }; + struct iattr lower_ia = { .ia_valid = 0 }; + int rc; + + rc = truncate_upper(dentry, &ia, &lower_ia); + if (!rc && lower_ia.ia_valid & ATTR_SIZE) { + struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); + + mutex_lock(&lower_dentry->d_inode->i_mutex); + rc = notify_change(lower_dentry, &lower_ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); + } + return rc; +} + static int ecryptfs_permission(struct inode *inode, int mask) { @@ -905,6 +952,7 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) { int rc = 0; struct dentry *lower_dentry; + struct iattr lower_ia; struct inode *inode; struct inode *lower_inode; struct ecryptfs_crypt_stat *crypt_stat; @@ -943,15 +991,11 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) } } mutex_unlock(&crypt_stat->cs_mutex); + memcpy(&lower_ia, ia, sizeof(lower_ia)); + if (ia->ia_valid & ATTR_FILE) + lower_ia.ia_file = ecryptfs_file_to_lower(ia->ia_file); if (ia->ia_valid & ATTR_SIZE) { - ecryptfs_printk(KERN_DEBUG, - "ia->ia_valid = [0x%x] ATTR_SIZE" " = [0x%x]\n", - ia->ia_valid, ATTR_SIZE); - rc = ecryptfs_truncate(dentry, ia->ia_size); - /* ecryptfs_truncate handles resizing of the lower file */ - ia->ia_valid &= ~ATTR_SIZE; - ecryptfs_printk(KERN_DEBUG, "ia->ia_valid = [%x]\n", - ia->ia_valid); + rc = truncate_upper(dentry, ia, &lower_ia); if (rc < 0) goto out; } @@ -960,17 +1004,32 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) * mode change is for clearing setuid/setgid bits. Allow lower fs * to interpret this in its own way. */ - if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) - ia->ia_valid &= ~ATTR_MODE; + if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) + lower_ia.ia_valid &= ~ATTR_MODE; mutex_lock(&lower_dentry->d_inode->i_mutex); - rc = notify_change(lower_dentry, ia); + rc = notify_change(lower_dentry, &lower_ia); mutex_unlock(&lower_dentry->d_inode->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode); return rc; } +int ecryptfs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct kstat lower_stat; + int rc; + + rc = vfs_getattr(ecryptfs_dentry_to_lower_mnt(dentry), + ecryptfs_dentry_to_lower(dentry), &lower_stat); + if (!rc) { + generic_fillattr(dentry->d_inode, stat); + stat->blocks = lower_stat.blocks; + } + return rc; +} + int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) @@ -1100,6 +1159,7 @@ const struct inode_operations ecryptfs_dir_iops = { const struct inode_operations ecryptfs_main_iops = { .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, + .getattr = ecryptfs_getattr, .setxattr = ecryptfs_setxattr, .getxattr = ecryptfs_getxattr, .listxattr = ecryptfs_listxattr, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 567bc4b9f70..ea2f92101df 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -585,8 +585,8 @@ out: * with as much information as it can before needing * the lower filesystem. * ecryptfs_read_super(): this accesses the lower filesystem and uses - * ecryptfs_interpolate to perform most of the linking - * ecryptfs_interpolate(): links the lower filesystem into ecryptfs + * ecryptfs_interpose to perform most of the linking + * ecryptfs_interpose(): links the lower filesystem into ecryptfs (inode.c) */ static int ecryptfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, diff --git a/fs/eventfd.c b/fs/eventfd.c index 8b47e4200e6..7758cc382ef 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -135,26 +135,71 @@ static unsigned int eventfd_poll(struct file *file, poll_table *wait) return events; } -static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) +static void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt) +{ + *cnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; + ctx->count -= *cnt; +} + +/** + * eventfd_ctx_remove_wait_queue - Read the current counter and removes wait queue. + * @ctx: [in] Pointer to eventfd context. + * @wait: [in] Wait queue to be removed. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked. + * + * This is used to atomically remove a wait queue entry from the eventfd wait + * queue head, and read/reset the counter value. + */ +int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_t *wait, + __u64 *cnt) +{ + unsigned long flags; + + spin_lock_irqsave(&ctx->wqh.lock, flags); + eventfd_ctx_do_read(ctx, cnt); + __remove_wait_queue(&ctx->wqh, wait); + if (*cnt != 0 && waitqueue_active(&ctx->wqh)) + wake_up_locked_poll(&ctx->wqh, POLLOUT); + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return *cnt != 0 ? 0 : -EAGAIN; +} +EXPORT_SYMBOL_GPL(eventfd_ctx_remove_wait_queue); + +/** + * eventfd_ctx_read - Reads the eventfd counter or wait if it is zero. + * @ctx: [in] Pointer to eventfd context. + * @no_wait: [in] Different from zero if the operation should not block. + * @cnt: [out] Pointer to the 64bit conter value. + * + * Returns zero if successful, or the following error codes: + * + * -EAGAIN : The operation would have blocked but @no_wait was nonzero. + * -ERESTARTSYS : A signal interrupted the wait operation. + * + * If @no_wait is zero, the function might sleep until the eventfd internal + * counter becomes greater than zero. + */ +ssize_t eventfd_ctx_read(struct eventfd_ctx *ctx, int no_wait, __u64 *cnt) { - struct eventfd_ctx *ctx = file->private_data; ssize_t res; - __u64 ucnt = 0; DECLARE_WAITQUEUE(wait, current); - if (count < sizeof(ucnt)) - return -EINVAL; spin_lock_irq(&ctx->wqh.lock); + *cnt = 0; res = -EAGAIN; if (ctx->count > 0) - res = sizeof(ucnt); - else if (!(file->f_flags & O_NONBLOCK)) { + res = 0; + else if (!no_wait) { __add_wait_queue(&ctx->wqh, &wait); - for (res = 0;;) { + for (;;) { set_current_state(TASK_INTERRUPTIBLE); if (ctx->count > 0) { - res = sizeof(ucnt); + res = 0; break; } if (signal_pending(current)) { @@ -168,18 +213,32 @@ static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, __remove_wait_queue(&ctx->wqh, &wait); __set_current_state(TASK_RUNNING); } - if (likely(res > 0)) { - ucnt = (ctx->flags & EFD_SEMAPHORE) ? 1 : ctx->count; - ctx->count -= ucnt; + if (likely(res == 0)) { + eventfd_ctx_do_read(ctx, cnt); if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, POLLOUT); } spin_unlock_irq(&ctx->wqh.lock); - if (res > 0 && put_user(ucnt, (__u64 __user *) buf)) - return -EFAULT; return res; } +EXPORT_SYMBOL_GPL(eventfd_ctx_read); + +static ssize_t eventfd_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct eventfd_ctx *ctx = file->private_data; + ssize_t res; + __u64 cnt; + + if (count < sizeof(cnt)) + return -EINVAL; + res = eventfd_ctx_read(ctx, file->f_flags & O_NONBLOCK, &cnt); + if (res < 0) + return res; + + return put_user(cnt, (__u64 __user *) buf) ? -EFAULT : sizeof(cnt); +} static ssize_t eventfd_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) @@ -339,7 +398,7 @@ struct file *eventfd_file_create(unsigned int count, int flags) ctx->flags = flags; file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, - flags & EFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & EFD_SHARED_FCNTL_FLAGS)); if (IS_ERR(file)) eventfd_free_ctx(ctx); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 366c503f965..bd056a5b4ef 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1206,7 +1206,7 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) * a file structure and a free file descriptor. */ error = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep, - flags & O_CLOEXEC); + O_RDWR | (flags & O_CLOEXEC)); if (error < 0) ep_free(ep); diff --git a/fs/exec.c b/fs/exec.c index 623a5cc3076..0790a107ff7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -826,7 +826,9 @@ static int de_thread(struct task_struct *tsk) attach_pid(tsk, PIDTYPE_PID, task_pid(leader)); transfer_pid(leader, tsk, PIDTYPE_PGID); transfer_pid(leader, tsk, PIDTYPE_SID); + list_replace_rcu(&leader->tasks, &tsk->tasks); + list_replace_init(&leader->sibling, &tsk->sibling); tsk->group_leader = tsk; leader->group_leader = tsk; @@ -939,9 +941,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) int flush_old_exec(struct linux_binprm * bprm) { - char * name; - int i, ch, retval; - char tcomm[sizeof(current->comm)]; + int retval; /* * Make sure we have a private signal table and that @@ -962,6 +962,25 @@ int flush_old_exec(struct linux_binprm * bprm) bprm->mm = NULL; /* We're using it now */ + current->flags &= ~PF_RANDOMIZE; + flush_thread(); + current->personality &= ~bprm->per_clear; + + return 0; + +out: + return retval; +} +EXPORT_SYMBOL(flush_old_exec); + +void setup_new_exec(struct linux_binprm * bprm) +{ + int i, ch; + char * name; + char tcomm[sizeof(current->comm)]; + + arch_pick_mmap_layout(current->mm); + /* This is the point of no return */ current->sas_ss_sp = current->sas_ss_size = 0; @@ -983,9 +1002,6 @@ int flush_old_exec(struct linux_binprm * bprm) tcomm[i] = '\0'; set_task_comm(current, tcomm); - current->flags &= ~PF_RANDOMIZE; - flush_thread(); - /* Set the new mm task size. We have to do that late because it may * depend on TIF_32BIT which is only updated in flush_thread() on * some architectures like powerpc @@ -1001,8 +1017,6 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } - current->personality &= ~bprm->per_clear; - /* * Flush performance counters when crossing a * security domain: @@ -1017,14 +1031,8 @@ int flush_old_exec(struct linux_binprm * bprm) flush_signal_handlers(current, 0); flush_old_files(current->files); - - return 0; - -out: - return retval; } - -EXPORT_SYMBOL(flush_old_exec); +EXPORT_SYMBOL(setup_new_exec); /* * Prepare credentials and lock ->cred_guard_mutex. @@ -1761,17 +1769,20 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) struct mm_struct *mm = current->mm; struct linux_binfmt * binfmt; struct inode * inode; - struct file * file; const struct cred *old_cred; struct cred *cred; int retval = 0; int flag = 0; int ispipe = 0; - unsigned long core_limit = current->signal->rlim[RLIMIT_CORE].rlim_cur; char **helper_argv = NULL; int helper_argc = 0; int dump_count = 0; static atomic_t core_dump_count = ATOMIC_INIT(0); + struct coredump_params cprm = { + .signr = signr, + .regs = regs, + .limit = current->signal->rlim[RLIMIT_CORE].rlim_cur, + }; audit_core_dumps(signr); @@ -1827,15 +1838,15 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) ispipe = format_corename(corename, signr); unlock_kernel(); - if ((!ispipe) && (core_limit < binfmt->min_coredump)) + if ((!ispipe) && (cprm.limit < binfmt->min_coredump)) goto fail_unlock; if (ispipe) { - if (core_limit == 0) { + if (cprm.limit == 0) { /* * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use - * core_limit of 0 here as a speacial value. Any + * cprm.limit of 0 here as a speacial value. Any * non-zero limit gets set to RLIM_INFINITY below, but * a limit of 0 skips the dump. This is a consistent * way to catch recursive crashes. We can still crash @@ -1868,25 +1879,25 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) goto fail_dropcount; } - core_limit = RLIM_INFINITY; + cprm.limit = RLIM_INFINITY; /* SIGPIPE can happen, but it's just never processed */ if (call_usermodehelper_pipe(helper_argv[0], helper_argv, NULL, - &file)) { + &cprm.file)) { printk(KERN_INFO "Core dump to %s pipe failed\n", corename); goto fail_dropcount; } } else - file = filp_open(corename, + cprm.file = filp_open(corename, O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag, 0600); - if (IS_ERR(file)) + if (IS_ERR(cprm.file)) goto fail_dropcount; - inode = file->f_path.dentry->d_inode; + inode = cprm.file->f_path.dentry->d_inode; if (inode->i_nlink > 1) goto close_fail; /* multiple links - don't dump */ - if (!ispipe && d_unhashed(file->f_path.dentry)) + if (!ispipe && d_unhashed(cprm.file->f_path.dentry)) goto close_fail; /* AK: actually i see no reason to not allow this for named pipes etc., @@ -1899,21 +1910,22 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ if (inode->i_uid != current_fsuid()) goto close_fail; - if (!file->f_op) + if (!cprm.file->f_op) goto close_fail; - if (!file->f_op->write) + if (!cprm.file->f_op->write) goto close_fail; - if (!ispipe && do_truncate(file->f_path.dentry, 0, 0, file) != 0) + if (!ispipe && + do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file) != 0) goto close_fail; - retval = binfmt->core_dump(signr, regs, file, core_limit); + retval = binfmt->core_dump(&cprm); if (retval) current->signal->group_exit_code |= 0x80; close_fail: if (ispipe && core_pipe_limit) - wait_for_dump_helpers(file); - filp_close(file, NULL); + wait_for_dump_helpers(cprm.file); + filp_close(cprm.file, NULL); fail_dropcount: if (dump_count) atomic_dec(&core_dump_count); diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 698a8636d39..2afbcebeda7 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -738,13 +738,28 @@ static int exofs_write_begin_export(struct file *file, fsdata); } +static int exofs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + struct inode *inode = mapping->host; + /* According to comment in simple_write_end i_mutex is held */ + loff_t i_size = inode->i_size; + int ret; + + ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); + if (i_size != inode->i_size) + mark_inode_dirty(inode); + return ret; +} + const struct address_space_operations exofs_aops = { .readpage = exofs_readpage, .readpages = exofs_readpages, .writepage = exofs_writepage, .writepages = exofs_writepages, .write_begin = exofs_write_begin_export, - .write_end = simple_write_end, + .write_end = exofs_write_end, }; /****************************************************************************** diff --git a/fs/exofs/pnfs.h b/fs/exofs/pnfs.h index 423033addd1..c52e9888b8a 100644 --- a/fs/exofs/pnfs.h +++ b/fs/exofs/pnfs.h @@ -15,13 +15,7 @@ #ifndef __EXOFS_PNFS_H__ #define __EXOFS_PNFS_H__ -#if defined(CONFIG_PNFS) - - -/* FIXME: move this file to: linux/exportfs/pnfs_osd_xdr.h */ -#include "../nfs/objlayout/pnfs_osd_xdr.h" - -#else /* defined(CONFIG_PNFS) */ +#if ! defined(__PNFS_OSD_XDR_H__) enum pnfs_iomode { IOMODE_READ = 1, @@ -46,6 +40,6 @@ struct pnfs_osd_data_map { u32 odm_raid_algorithm; }; -#endif /* else defined(CONFIG_PNFS) */ +#endif /* ! defined(__PNFS_OSD_XDR_H__) */ #endif /* __EXOFS_PNFS_H__ */ diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index ad14227f509..455e6e6e5cb 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -970,7 +970,7 @@ static int ext3_get_block(struct inode *inode, sector_t iblock, if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; handle = ext3_journal_start(inode, DIO_CREDITS + - 2 * EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb)); + EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -3146,8 +3146,8 @@ int ext3_setattr(struct dentry *dentry, struct iattr *attr) /* (user+group)*(old+new) structure, inode write (sb, * inode block, ? - but truncate inode update has it) */ - handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+ - EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3); + handle = ext3_journal_start(inode, EXT3_MAXQUOTAS_INIT_BLOCKS(inode->i_sb)+ + EXT3_MAXQUOTAS_DEL_BLOCKS(inode->i_sb)+3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; @@ -3239,7 +3239,7 @@ static int ext3_writepage_trans_blocks(struct inode *inode) #ifdef CONFIG_QUOTA /* We know that structure was already allocated during vfs_dq_init so * we will be updating only the data blocks + inodes */ - ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb); + ret += EXT3_MAXQUOTAS_TRANS_BLOCKS(inode->i_sb); #endif return ret; diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index aad6400c9b7..7b0e44f7d66 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1699,7 +1699,7 @@ static int ext3_create (struct inode * dir, struct dentry * dentry, int mode, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1733,7 +1733,7 @@ static int ext3_mknod (struct inode * dir, struct dentry *dentry, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1769,7 +1769,7 @@ static int ext3_mkdir(struct inode * dir, struct dentry * dentry, int mode) retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 3 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); @@ -1920,7 +1920,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0, rc; - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_orphan_lock); if (!list_empty(&EXT3_I(inode)->i_orphan)) goto out_unlock; @@ -1929,9 +1929,13 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext3_orphan_add(). We block - * here (on lock_super()), so race with ext3_link() which might bump + * here (on s_orphan_lock), so race with ext3_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -1968,7 +1972,7 @@ int ext3_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_orphan_lock); ext3_std_error(inode->i_sb, err); return err; } @@ -1986,11 +1990,9 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) struct ext3_iloc iloc; int err = 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT3_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2040,7 +2042,7 @@ int ext3_orphan_del(handle_t *handle, struct inode *inode) out_err: ext3_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT3_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: @@ -2175,7 +2177,7 @@ static int ext3_symlink (struct inode * dir, retry: handle = ext3_journal_start(dir, EXT3_DATA_TRANS_BLOCKS(dir->i_sb) + EXT3_INDEX_EXTRA_TRANS_BLOCKS + 5 + - 2*EXT3_QUOTA_INIT_BLOCKS(dir->i_sb)); + EXT3_MAXQUOTAS_INIT_BLOCKS(dir->i_sb)); if (IS_ERR(handle)) return PTR_ERR(handle); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 5f83b617917..54351ac7cef 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -209,7 +209,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -324,7 +324,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; @@ -662,11 +662,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -825,7 +826,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -856,7 +857,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -936,7 +937,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) ext3_journal_dirty_metadata(handle, sbi->s_sbh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext3_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -973,7 +974,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); o_groups_count = EXT3_SB(sb)->s_groups_count; @@ -1045,11 +1046,11 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT3_SB(sb)->s_resize_lock); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1059,13 +1060,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_SB(sb)->s_sbh))) { ext3_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_journal_stop(handle); goto exit_put; } es->s_blocks_count = cpu_to_le32(o_blocks_count + add); ext3_journal_dirty_metadata(handle, EXT3_SB(sb)->s_sbh); - unlock_super(sb); + mutex_unlock(&EXT3_SB(sb)->s_resize_lock); ext3_debug("freeing blocks %lu through "E3FSBLK"\n", o_blocks_count, o_blocks_count + add); ext3_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 7ad1e8c30bd..afa2b569da1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1928,6 +1928,8 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sb->dq_op = &ext3_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; @@ -2014,14 +2016,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) } ext3_setup_super (sb, es, sb->s_flags & MS_RDONLY); - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ + EXT3_SB(sb)->s_mount_state |= EXT3_ORPHAN_FS; ext3_orphan_cleanup(sb, es); EXT3_SB(sb)->s_mount_state &= ~EXT3_ORPHAN_FS; @@ -2403,13 +2398,11 @@ static void ext3_mark_recovery_complete(struct super_block * sb, if (journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT3_HAS_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT3_CLEAR_INCOMPAT_FEATURE(sb, EXT3_FEATURE_INCOMPAT_RECOVER); ext3_commit_super(sb, es, 1); } - unlock_super(sb); out: journal_unlock_updates(journal); @@ -2601,13 +2594,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) (sbi->s_mount_state & EXT3_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - unlock_super(sb); ext3_mark_recovery_complete(sb, es); - lock_super(sb); } else { __le32 ret; if ((ret = EXT3_HAS_RO_COMPAT_FEATURE(sb, diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index e5f6774846e..9ed1bb1f319 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -2,7 +2,6 @@ config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 - select FS_JOURNAL_INFO help This is the next generation of the ext3 filesystem. @@ -29,6 +28,7 @@ config EXT4_FS config EXT4_USE_FOR_EXT23 bool "Use ext4 for ext2/ext3 file systems" + depends on EXT4_FS depends on EXT3_FS=n || EXT2_FS=n default y help diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 4df8621ec31..a60ab9aad57 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -16,7 +16,6 @@ #include <linux/module.h> #include <linux/swap.h> #include <linux/pagemap.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4.h" diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ab31e65d46d..874d169a193 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -361,14 +361,11 @@ struct ext4_new_group_data { so set the magic i_delalloc_reserve_flag after taking the inode allocation semaphore for */ #define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0004 - /* Call ext4_da_update_reserve_space() after successfully - allocating the blocks */ -#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0008 /* caller is from the direct IO path, request to creation of an unitialized extents if not allocated, split the uninitialized extent if blocks has been preallocated already*/ -#define EXT4_GET_BLOCKS_DIO 0x0010 -#define EXT4_GET_BLOCKS_CONVERT 0x0020 +#define EXT4_GET_BLOCKS_DIO 0x0008 +#define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_DIO_CREATE_EXT (EXT4_GET_BLOCKS_DIO|\ EXT4_GET_BLOCKS_CREATE_UNINIT_EXT) /* Convert extent to initialized after direct IO complete */ @@ -699,11 +696,17 @@ struct ext4_inode_info { unsigned int i_reserved_meta_blocks; unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + sector_t i_da_metadata_calc_last_lblock; + int i_da_metadata_calc_len; /* on-disk additional length */ __u16 i_extra_isize; spinlock_t i_block_reservation_lock; +#ifdef CONFIG_QUOTA + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif /* completed async DIOs that might need unwritten extents handling */ struct list_head i_aio_dio_complete_list; @@ -1435,8 +1438,10 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); -extern qsize_t ext4_get_reserved_space(struct inode *inode); +extern qsize_t *ext4_get_reserved_space(struct inode *inode); extern int flush_aio_dio_completed_IO(struct inode *inode); +extern void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 2ca686454e8..bdb6ce7e2eb 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -225,7 +225,8 @@ static inline void ext4_ext_mark_initialized(struct ext4_extent *ext) ext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(ext)); } -extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); +extern int ext4_ext_calc_metadata_amount(struct inode *inode, + sector_t lblocks); extern ext4_fsblk_t ext_pblock(struct ext4_extent *ex); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 3a7928f825e..765a4826b11 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -296,29 +296,44 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) * to allocate @blocks * Worse case is one block per extent */ -int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks) +int ext4_ext_calc_metadata_amount(struct inode *inode, sector_t lblock) { - int lcap, icap, rcap, leafs, idxs, num; - int newextents = blocks; - - rcap = ext4_ext_space_root_idx(inode, 0); - lcap = ext4_ext_space_block(inode, 0); - icap = ext4_ext_space_block_idx(inode, 0); + struct ext4_inode_info *ei = EXT4_I(inode); + int idxs, num = 0; - /* number of new leaf blocks needed */ - num = leafs = (newextents + lcap - 1) / lcap; + idxs = ((inode->i_sb->s_blocksize - sizeof(struct ext4_extent_header)) + / sizeof(struct ext4_extent_idx)); /* - * Worse case, we need separate index block(s) - * to link all new leaf blocks + * If the new delayed allocation block is contiguous with the + * previous da block, it can share index blocks with the + * previous block, so we only need to allocate a new index + * block every idxs leaf blocks. At ldxs**2 blocks, we need + * an additional index block, and at ldxs**3 blocks, yet + * another index blocks. */ - idxs = (leafs + icap - 1) / icap; - do { - num += idxs; - idxs = (idxs + icap - 1) / icap; - } while (idxs > rcap); + if (ei->i_da_metadata_calc_len && + ei->i_da_metadata_calc_last_lblock+1 == lblock) { + if ((ei->i_da_metadata_calc_len % idxs) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs)) == 0) + num++; + if ((ei->i_da_metadata_calc_len % (idxs*idxs*idxs)) == 0) { + num++; + ei->i_da_metadata_calc_len = 0; + } else + ei->i_da_metadata_calc_len++; + ei->i_da_metadata_calc_last_lblock++; + return num; + } - return num; + /* + * In the worst case we need a new set of index blocks at + * every level of the inode's extent tree. + */ + ei->i_da_metadata_calc_len = 1; + ei->i_da_metadata_calc_last_lblock = lblock; + return ext_depth(inode) + 1; } static int @@ -3023,6 +3038,14 @@ out: return err; } +static void unmap_underlying_metadata_blocks(struct block_device *bdev, + sector_t block, int count) +{ + int i; + for (i = 0; i < count; i++) + unmap_underlying_metadata(bdev, block + i); +} + static int ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, @@ -3098,6 +3121,30 @@ out: } else allocated = ret; set_buffer_new(bh_result); + /* + * if we allocated more blocks than requested + * we need to make sure we unmap the extra block + * allocated. The actual needed block will get + * unmapped later when we find the buffer_head marked + * new. + */ + if (allocated > max_blocks) { + unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, + newblock + max_blocks, + allocated - max_blocks); + allocated = max_blocks; + } + + /* + * If we have done fallocate with the offset that is already + * delayed allocated, we would have block reservation + * and quota reservation done in the delayed write path. + * But fallocate would have already updated quota and block + * count for this offset. So cancel these reservation + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 0); + map_out: set_buffer_mapped(bh_result); out1: @@ -3190,7 +3237,13 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * this situation is possible, though, _during_ tree modification; * this is why assert can't be put in ext4_ext_find_extent() */ - BUG_ON(path[depth].p_ext == NULL && depth != 0); + if (path[depth].p_ext == NULL && depth != 0) { + ext4_error(inode->i_sb, __func__, "bad extent address " + "inode: %lu, iblock: %d, depth: %d", + inode->i_ino, iblock, depth); + err = -EIO; + goto out2; + } eh = path[depth].p_hdr; ex = path[depth].p_ext; @@ -3327,9 +3380,18 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* previous routine could use block we allocated */ newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); + if (allocated > max_blocks) + allocated = max_blocks; set_buffer_new(bh_result); /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ext4_da_update_reserve_space(inode, allocated, 1); + + /* * Cache the extent and update transaction to commit on fdatasync only * when it is _not_ an uninitialized extent. */ diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 0b22497d92e..98bd140aad0 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -88,9 +88,21 @@ int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) return ext4_force_commit(inode->i_sb); commit_tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid; - if (jbd2_log_start_commit(journal, commit_tid)) + if (jbd2_log_start_commit(journal, commit_tid)) { + /* + * When the journal is on a different device than the + * fs data disk, we need to issue the barrier in + * writeback mode. (In ordered mode, the jbd2 layer + * will take care of issuing the barrier. In + * data=journal, all of the data blocks are written to + * the journal device.) + */ + if (ext4_should_writeback_data(inode) && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(inode->i_sb->s_bdev, NULL); jbd2_log_wait_commit(journal, commit_tid); - else if (journal->j_flags & JBD2_BARRIER) + } else if (journal->j_flags & JBD2_BARRIER) blkdev_issue_flush(inode->i_sb->s_bdev, NULL); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5352db1a308..e11952404e0 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1003,92 +1003,120 @@ out: return err; } -qsize_t ext4_get_reserved_space(struct inode *inode) +#ifdef CONFIG_QUOTA +qsize_t *ext4_get_reserved_space(struct inode *inode) { - unsigned long long total; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + - EXT4_I(inode)->i_reserved_meta_blocks; - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - - return (total << inode->i_blkbits); + return &EXT4_I(inode)->i_reserved_quota; } +#endif + /* * Calculate the number of metadata blocks need to reserve - * to allocate @blocks for non extent file based file + * to allocate a new block at @lblocks for non extent file based file */ -static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_indirect_calc_metadata_amount(struct inode *inode, + sector_t lblock) { - int icap = EXT4_ADDR_PER_BLOCK(inode->i_sb); - int ind_blks, dind_blks, tind_blks; - - /* number of new indirect blocks needed */ - ind_blks = (blocks + icap - 1) / icap; + struct ext4_inode_info *ei = EXT4_I(inode); + int dind_mask = EXT4_ADDR_PER_BLOCK(inode->i_sb) - 1; + int blk_bits; - dind_blks = (ind_blks + icap - 1) / icap; + if (lblock < EXT4_NDIR_BLOCKS) + return 0; - tind_blks = 1; + lblock -= EXT4_NDIR_BLOCKS; - return ind_blks + dind_blks + tind_blks; + if (ei->i_da_metadata_calc_len && + (lblock & dind_mask) == ei->i_da_metadata_calc_last_lblock) { + ei->i_da_metadata_calc_len++; + return 0; + } + ei->i_da_metadata_calc_last_lblock = lblock & dind_mask; + ei->i_da_metadata_calc_len = 1; + blk_bits = roundup_pow_of_two(lblock + 1); + return (blk_bits / EXT4_ADDR_PER_BLOCK_BITS(inode->i_sb)) + 1; } /* * Calculate the number of metadata blocks need to reserve - * to allocate given number of blocks + * to allocate a block located at @lblock */ -static int ext4_calc_metadata_amount(struct inode *inode, int blocks) +static int ext4_calc_metadata_amount(struct inode *inode, sector_t lblock) { - if (!blocks) - return 0; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_calc_metadata_amount(inode, blocks); + return ext4_ext_calc_metadata_amount(inode, lblock); - return ext4_indirect_calc_metadata_amount(inode, blocks); + return ext4_indirect_calc_metadata_amount(inode, lblock); } -static void ext4_da_update_reserve_space(struct inode *inode, int used) +/* + * Called with i_data_sem down, which is important since we can call + * ext4_discard_preallocations() from here. + */ +void ext4_da_update_reserve_space(struct inode *inode, + int used, int quota_claim) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free; - - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - used; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - if (mdb_free) { - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - - /* update fs dirty blocks counter */ + struct ext4_inode_info *ei = EXT4_I(inode); + int mdb_free = 0, allocated_meta_blocks = 0; + + spin_lock(&ei->i_block_reservation_lock); + if (unlikely(used > ei->i_reserved_data_blocks)) { + ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d " + "with only %d reserved data blocks\n", + __func__, inode->i_ino, used, + ei->i_reserved_data_blocks); + WARN_ON(1); + used = ei->i_reserved_data_blocks; + } + + /* Update per-inode reservations */ + ei->i_reserved_data_blocks -= used; + used += ei->i_allocated_meta_blocks; + ei->i_reserved_meta_blocks -= ei->i_allocated_meta_blocks; + allocated_meta_blocks = ei->i_allocated_meta_blocks; + ei->i_allocated_meta_blocks = 0; + percpu_counter_sub(&sbi->s_dirtyblocks_counter, used); + + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + mdb_free = ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); - EXT4_I(inode)->i_allocated_meta_blocks = 0; - EXT4_I(inode)->i_reserved_meta_blocks = mdb; } - - /* update per-inode reservations */ - BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= used; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - /* - * free those over-booking quota for metadata blocks - */ - if (mdb_free) - vfs_dq_release_reservation_block(inode, mdb_free); + /* Update quota subsystem */ + if (quota_claim) { + vfs_dq_claim_block(inode, used); + if (mdb_free) + vfs_dq_release_reservation_block(inode, mdb_free); + } else { + /* + * We did fallocate with an offset that is already delayed + * allocated. So on delayed allocated writeback we should + * not update the quota for allocated blocks. But then + * converting an fallocate region to initialized region would + * have caused a metadata allocation. So claim quota for + * that + */ + if (allocated_meta_blocks) + vfs_dq_claim_block(inode, allocated_meta_blocks); + vfs_dq_release_reservation_block(inode, mdb_free + used); + } /* * If we have done all the pending block allocations and if * there aren't any writers on the inode, we can discard the * inode's preallocations. */ - if (!total && (atomic_read(&inode->i_writecount) == 0)) + if ((ei->i_reserved_data_blocks == 0) && + (atomic_read(&inode->i_writecount) == 0)) ext4_discard_preallocations(inode); } @@ -1280,18 +1308,20 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ EXT4_I(inode)->i_state &= ~EXT4_STATE_EXT_MIGRATE; } - } + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. We don't + * support fallocate for non extent files. So we can update + * reserve space here. + */ + if ((retval > 0) && + (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) + ext4_da_update_reserve_space(inode, retval, 1); + } if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks after successful - * block allocation which had been deferred till now. - */ - if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) - ext4_da_update_reserve_space(inode, retval); - up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && buffer_mapped(bh)) { int ret = check_block_validity(inode, "file system " @@ -1797,11 +1827,15 @@ static int ext4_journalled_write_end(struct file *file, return ret ? ret : copied; } -static int ext4_da_reserve_space(struct inode *inode, int nrblocks) +/* + * Reserve a single block located at lblock + */ +static int ext4_da_reserve_space(struct inode *inode, sector_t lblock) { int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned long md_needed, mdblocks, total = 0; + struct ext4_inode_info *ei = EXT4_I(inode); + unsigned long md_needed, md_reserved; /* * recalculate the amount of metadata blocks to reserve @@ -1809,86 +1843,78 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) * worse case is one extent per block */ repeat: - spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; - mdblocks = ext4_calc_metadata_amount(inode, total); - BUG_ON(mdblocks < EXT4_I(inode)->i_reserved_meta_blocks); - - md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; - total = md_needed + nrblocks; + spin_lock(&ei->i_block_reservation_lock); + md_reserved = ei->i_reserved_meta_blocks; + md_needed = ext4_calc_metadata_amount(inode, lblock); + spin_unlock(&ei->i_block_reservation_lock); /* * Make quota reservation here to prevent quota overflow * later. Real quota accounting is done at pages writeout * time. */ - if (vfs_dq_reserve_block(inode, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (vfs_dq_reserve_block(inode, md_needed + 1)) return -EDQUOT; - } - if (ext4_claim_free_blocks(sbi, total)) { - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, total); + if (ext4_claim_free_blocks(sbi, md_needed + 1)) { + vfs_dq_release_reservation_block(inode, md_needed + 1); if (ext4_should_retry_alloc(inode->i_sb, &retries)) { yield(); goto repeat; } return -ENOSPC; } - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; - EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; + spin_lock(&ei->i_block_reservation_lock); + ei->i_reserved_data_blocks++; + ei->i_reserved_meta_blocks += md_needed; + spin_unlock(&ei->i_block_reservation_lock); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); return 0; /* success */ } static void ext4_da_release_space(struct inode *inode, int to_free) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int total, mdb, mdb_free, release; + struct ext4_inode_info *ei = EXT4_I(inode); if (!to_free) return; /* Nothing to release, exit */ spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - if (!EXT4_I(inode)->i_reserved_data_blocks) { + if (unlikely(to_free > ei->i_reserved_data_blocks)) { /* - * if there is no reserved blocks, but we try to free some - * then the counter is messed up somewhere. - * but since this function is called from invalidate - * page, it's harmless to return without any action + * if there aren't enough reserved blocks, then the + * counter is messed up somewhere. Since this + * function is called from invalidate page, it's + * harmless to return without any action. */ - printk(KERN_INFO "ext4 delalloc try to release %d reserved " - "blocks for inode %lu, but there is no reserved " - "data blocks\n", to_free, inode->i_ino); - spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - return; + ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: " + "ino %lu, to_free %d with only %d reserved " + "data blocks\n", inode->i_ino, to_free, + ei->i_reserved_data_blocks); + WARN_ON(1); + to_free = ei->i_reserved_data_blocks; } + ei->i_reserved_data_blocks -= to_free; - /* recalculate the number of metablocks still need to be reserved */ - total = EXT4_I(inode)->i_reserved_data_blocks - to_free; - mdb = ext4_calc_metadata_amount(inode, total); - - /* figure out how many metablocks to release */ - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - - release = to_free + mdb_free; - - /* update fs dirty blocks counter for truncate case */ - percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); + if (ei->i_reserved_data_blocks == 0) { + /* + * We can release all of the reserved metadata blocks + * only when we have written all of the delayed + * allocation blocks. + */ + to_free += ei->i_reserved_meta_blocks; + ei->i_reserved_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; + } - /* update per-inode reservations */ - BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); - EXT4_I(inode)->i_reserved_data_blocks -= to_free; + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, to_free); - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); - vfs_dq_release_reservation_block(inode, release); + vfs_dq_release_reservation_block(inode, to_free); } static void ext4_da_page_release_reservation(struct page *page, @@ -2193,10 +2219,10 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * variables are updated after the blocks have been allocated. */ new.b_state = 0; - get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + get_blocks_flags = EXT4_GET_BLOCKS_CREATE; if (mpd->b_state & (1 << BH_Delay)) - get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; + get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, &new, get_blocks_flags); if (blks < 0) { @@ -2494,7 +2520,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * XXX: __block_prepare_write() unmaps passed block, * is it OK? */ - ret = ext4_da_reserve_space(inode, 1); + ret = ext4_da_reserve_space(inode, iblock); if (ret) /* not enough space to reserve */ return ret; @@ -2968,8 +2994,7 @@ retry: out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; - if (wbc->nr_to_write > nr_to_writebump) - wbc->nr_to_write -= nr_to_writebump; + wbc->nr_to_write -= nr_to_writebump; wbc->range_start = range_start; trace_ext4_da_writepages_result(inode, wbc, ret, pages_written); return ret; @@ -2994,11 +3019,18 @@ static int ext4_nonda_switch(struct super_block *sb) if (2 * free_blocks < 3 * dirty_blocks || free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { /* - * free block count is less that 150% of dirty blocks - * or free blocks is less that watermark + * free block count is less than 150% of dirty blocks + * or free blocks is less than watermark */ return 1; } + /* + * Even if we don't switch but are nearing capacity, + * start pushing delalloc when 1/2 of free blocks are dirty. + */ + if (free_blocks < 2 * dirty_blocks) + writeback_inodes_sb_if_idle(sb); + return 0; } @@ -3006,7 +3038,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - int ret, retries = 0; + int ret, retries = 0, quota_retries = 0; struct page *page; pgoff_t index; unsigned from, to; @@ -3065,6 +3097,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + + if ((ret == -EDQUOT) && + EXT4_I(inode)->i_reserved_meta_blocks && + (quota_retries++ < 3)) { + /* + * Since we often over-estimate the number of meta + * data blocks required, we may sometimes get a + * spurios out of quota error even though there would + * be enough space once we write the data blocks and + * find out how many meta data blocks were _really_ + * required. So try forcing the inode write to see if + * that helps. + */ + write_inode_now(inode, (quota_retries == 3)); + goto retry; + } out: return ret; } @@ -4794,6 +4842,9 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); ei->i_disksize = inode->i_size; +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_block_group = iloc.block_group; ei->i_last_alloc_group = ~0; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b1fd3daadc9..d34afad3e13 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2755,12 +2755,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); - else { - percpu_counter_sub(&sbi->s_dirtyblocks_counter, - ac->ac_b_ex.fe_len); - /* convert reserved quota blocks to real quota blocks */ - vfs_dq_claim_block(ac->ac_inode, ac->ac_b_ex.fe_len); - } if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 0ca811061bc..436521cae45 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -17,7 +17,6 @@ #include <linux/proc_fs.h> #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/version.h> #include <linux/blkdev.h> #include <linux/mutex.h> #include "ext4_jbd2.h" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 827bde1f259..735c20d5fd5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -702,8 +702,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; + ei->i_da_metadata_calc_len = 0; ei->i_delalloc_reserved_flag = 0; spin_lock_init(&(ei->i_block_reservation_lock)); +#ifdef CONFIG_QUOTA + ei->i_reserved_quota = 0; +#endif INIT_LIST_HEAD(&ei->i_aio_dio_complete_list); ei->cur_aio_dio = NULL; ei->i_sync_tid = 0; @@ -1014,7 +1018,9 @@ static const struct dquot_operations ext4_quota_operations = { .reserve_space = dquot_reserve_space, .claim_space = dquot_claim_space, .release_rsv = dquot_release_reserved_space, +#ifdef CONFIG_QUOTA .get_reserved_space = ext4_get_reserved_space, +#endif .alloc_inode = dquot_alloc_inode, .free_space = dquot_free_space, .free_inode = dquot_free_inode, @@ -2169,9 +2175,9 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, struct super_block *sb = sbi->s_buddy_cache->i_sb; return snprintf(buf, PAGE_SIZE, "%llu\n", - sbi->s_kbytes_written + + (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + EXT4_SB(sb)->s_sectors_written_start) >> 1))); } static ssize_t inode_readahead_blks_store(struct ext4_attr *a, @@ -4000,6 +4006,7 @@ static inline void unregister_as_ext2(void) { unregister_filesystem(&ext2_fs_type); } +MODULE_ALIAS("ext2"); #else static inline void register_as_ext2(void) { } static inline void unregister_as_ext2(void) { } @@ -4026,6 +4033,7 @@ static inline void unregister_as_ext3(void) { unregister_filesystem(&ext3_fs_type); } +MODULE_ALIAS("ext3"); #else static inline void register_as_ext3(void) { } static inline void unregister_as_ext3(void) { } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 83218bebbc7..f3a2f7ed45a 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1332,6 +1332,8 @@ retry: goto cleanup; kfree(b_entry_name); kfree(buffer); + b_entry_name = NULL; + buffer = NULL; brelse(is->iloc.bh); kfree(is); kfree(bs); diff --git a/fs/fcntl.c b/fs/fcntl.c index 2cf93ec40a6..97e01dc0d95 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -618,60 +618,90 @@ static DEFINE_RWLOCK(fasync_lock); static struct kmem_cache *fasync_cache __read_mostly; /* - * fasync_helper() is used by almost all character device drivers - * to set up the fasync queue. It returns negative on error, 0 if it did - * no changes and positive if it added/deleted the entry. + * Remove a fasync entry. If successfully removed, return + * positive and clear the FASYNC flag. If no entry exists, + * do nothing and return 0. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + * + * We always take the 'filp->f_lock', in since fasync_lock + * needs to be irq-safe. */ -int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +static int fasync_remove_entry(struct file *filp, struct fasync_struct **fapp) { struct fasync_struct *fa, **fp; - struct fasync_struct *new = NULL; int result = 0; - if (on) { - new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); - if (!new) - return -ENOMEM; + spin_lock(&filp->f_lock); + write_lock_irq(&fasync_lock); + for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { + if (fa->fa_file != filp) + continue; + *fp = fa->fa_next; + kmem_cache_free(fasync_cache, fa); + filp->f_flags &= ~FASYNC; + result = 1; + break; } + write_unlock_irq(&fasync_lock); + spin_unlock(&filp->f_lock); + return result; +} + +/* + * Add a fasync entry. Return negative on error, positive if + * added, and zero if did nothing but change an existing one. + * + * NOTE! It is very important that the FASYNC flag always + * match the state "is the filp on a fasync list". + */ +static int fasync_add_entry(int fd, struct file *filp, struct fasync_struct **fapp) +{ + struct fasync_struct *new, *fa, **fp; + int result = 0; + + new = kmem_cache_alloc(fasync_cache, GFP_KERNEL); + if (!new) + return -ENOMEM; - /* - * We need to take f_lock first since it's not an IRQ-safe - * lock. - */ spin_lock(&filp->f_lock); write_lock_irq(&fasync_lock); for (fp = fapp; (fa = *fp) != NULL; fp = &fa->fa_next) { - if (fa->fa_file == filp) { - if(on) { - fa->fa_fd = fd; - kmem_cache_free(fasync_cache, new); - } else { - *fp = fa->fa_next; - kmem_cache_free(fasync_cache, fa); - result = 1; - } - goto out; - } + if (fa->fa_file != filp) + continue; + fa->fa_fd = fd; + kmem_cache_free(fasync_cache, new); + goto out; } - if (on) { - new->magic = FASYNC_MAGIC; - new->fa_file = filp; - new->fa_fd = fd; - new->fa_next = *fapp; - *fapp = new; - result = 1; - } + new->magic = FASYNC_MAGIC; + new->fa_file = filp; + new->fa_fd = fd; + new->fa_next = *fapp; + *fapp = new; + result = 1; + filp->f_flags |= FASYNC; + out: - if (on) - filp->f_flags |= FASYNC; - else - filp->f_flags &= ~FASYNC; write_unlock_irq(&fasync_lock); spin_unlock(&filp->f_lock); return result; } +/* + * fasync_helper() is used by almost all character device drivers + * to set up the fasync queue, and for regular files by the file + * lease code. It returns negative on error, 0 if it did no changes + * and positive if it added/deleted the entry. + */ +int fasync_helper(int fd, struct file * filp, int on, struct fasync_struct **fapp) +{ + if (!on) + return fasync_remove_entry(filp, fapp); + return fasync_add_entry(fd, filp, fapp); +} + EXPORT_SYMBOL(fasync_helper); void __kill_fasync(struct fasync_struct *fa, int sig, int band) diff --git a/fs/file_table.c b/fs/file_table.c index 0afacf65439..b98404b5438 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -186,10 +186,8 @@ struct file *alloc_file(struct path *path, fmode_t mode, * that we can do debugging checks at __fput() */ if ((mode & FMODE_WRITE) && !special_file(path->dentry->d_inode->i_mode)) { - int error = 0; file_take_write(file); - error = mnt_clone_write(path->mnt); - WARN_ON(error); + WARN_ON(mnt_clone_write(path->mnt)); } ima_counts_get(file); return file; @@ -255,6 +253,7 @@ void __fput(struct file *file) if (file->f_op && file->f_op->release) file->f_op->release(inode, file); security_file_free(file); + ima_file_free(file); if (unlikely(S_ISCHR(inode->i_mode) && inode->i_cdev != NULL)) cdev_put(inode->i_cdev); fops_put(file->f_op); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 49bc1b8e8f1..1a7c42c64ff 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -242,6 +242,7 @@ static void bdi_sync_writeback(struct backing_dev_info *bdi, /** * bdi_start_writeback - start writeback * @bdi: the backing device to write from + * @sb: write inodes from this super_block * @nr_pages: the number of pages to write * * Description: @@ -1187,6 +1188,23 @@ void writeback_inodes_sb(struct super_block *sb) EXPORT_SYMBOL(writeback_inodes_sb); /** + * writeback_inodes_sb_if_idle - start writeback if none underway + * @sb: the superblock + * + * Invoke writeback_inodes_sb if no writeback is currently underway. + * Returns 1 if writeback was started, 0 if not. + */ +int writeback_inodes_sb_if_idle(struct super_block *sb) +{ + if (!writeback_in_progress(sb->s_bdi)) { + writeback_inodes_sb(sb); + return 1; + } else + return 0; +} +EXPORT_SYMBOL(writeback_inodes_sb_if_idle); + +/** * sync_inodes_sb - sync sb inode pages * @sb: the superblock * diff --git a/fs/fuse/file.c b/fs/fuse/file.c index c18913a777a..a9f5e137f1d 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -828,6 +828,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, if (!page) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); pagefault_enable(); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index b192c661caa..4dcddf83326 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -10,7 +10,6 @@ config GFS2_FS select SLOW_WORK select QUOTA select QUOTACTL - select FS_JOURNAL_INFO help A cluster filesystem. diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 4eb308aa323..a6abbae8a27 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -569,6 +569,40 @@ static int gfs2_fsync(struct file *file, struct dentry *dentry, int datasync) return ret; } +/** + * gfs2_file_aio_write - Perform a write to a file + * @iocb: The io context + * @iov: The data to write + * @nr_segs: Number of @iov segments + * @pos: The file position + * + * We have to do a lock/unlock here to refresh the inode size for + * O_APPEND writes, otherwise we can land up writing at the wrong + * offset. There is still a race, but provided the app is using its + * own file locking, this will make O_APPEND work as expected. + * + */ + +static ssize_t gfs2_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + + if (file->f_flags & O_APPEND) { + struct dentry *dentry = file->f_dentry; + struct gfs2_inode *ip = GFS2_I(dentry->d_inode); + struct gfs2_holder gh; + int ret; + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + return ret; + gfs2_glock_dq_uninit(&gh); + } + + return generic_file_aio_write(iocb, iov, nr_segs, pos); +} + #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** @@ -711,7 +745,7 @@ const struct file_operations gfs2_file_fops = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, @@ -741,7 +775,7 @@ const struct file_operations gfs2_file_fops_nolock = { .read = do_sync_read, .aio_read = generic_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = gfs2_file_aio_write, .unlocked_ioctl = gfs2_ioctl, .mmap = gfs2_mmap, .open = gfs2_open, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index f455a03a09e..f4266332593 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -769,6 +769,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, if (!gl) return -ENOMEM; + atomic_inc(&sdp->sd_glock_disposal); gl->gl_flags = 0; gl->gl_name = name; atomic_set(&gl->gl_ref, 1); @@ -1538,6 +1539,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) up_write(&gfs2_umount_flush_sem); msleep(10); } + flush_workqueue(glock_workqueue); + wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + gfs2_dump_lockstate(sdp); } void gfs2_glock_finish_truncate(struct gfs2_inode *ip) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 13f0bd22813..c0262faf472 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -123,7 +123,7 @@ struct lm_lockops { int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname); void (*lm_unmount) (struct gfs2_sbd *sdp); void (*lm_withdraw) (struct gfs2_sbd *sdp); - void (*lm_put_lock) (struct kmem_cache *cachep, void *gl); + void (*lm_put_lock) (struct kmem_cache *cachep, struct gfs2_glock *gl); unsigned int (*lm_lock) (struct gfs2_glock *gl, unsigned int req_state, unsigned int flags); void (*lm_cancel) (struct gfs2_glock *gl); diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4792200978c..bc0ad158e6b 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -544,6 +544,8 @@ struct gfs2_sbd { struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_trans_gl; + wait_queue_head_t sd_glock_wait; + atomic_t sd_glock_disposal; /* Inode Stuff */ diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 46df988323b..0e5e0e7022e 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -21,6 +21,7 @@ static void gdlm_ast(void *arg) { struct gfs2_glock *gl = arg; unsigned ret = gl->gl_state; + struct gfs2_sbd *sdp = gl->gl_sbd; BUG_ON(gl->gl_lksb.sb_flags & DLM_SBF_DEMOTED); @@ -30,6 +31,8 @@ static void gdlm_ast(void *arg) switch (gl->gl_lksb.sb_status) { case -DLM_EUNLOCK: /* Unlocked, so glock can be freed */ kmem_cache_free(gfs2_glock_cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); return; case -DLM_ECANCEL: /* Cancel while getting lock */ ret |= LM_OUT_CANCELED; @@ -164,14 +167,16 @@ static unsigned int gdlm_lock(struct gfs2_glock *gl, return LM_OUT_ASYNC; } -static void gdlm_put_lock(struct kmem_cache *cachep, void *ptr) +static void gdlm_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) { - struct gfs2_glock *gl = ptr; - struct lm_lockstruct *ls = &gl->gl_sbd->sd_lockstruct; + struct gfs2_sbd *sdp = gl->gl_sbd; + struct lm_lockstruct *ls = &sdp->sd_lockstruct; int error; if (gl->gl_lksb.sb_lkid == 0) { kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); return; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index cb8d7a93d5e..6f68a5f18eb 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -121,7 +121,7 @@ struct inode *gfs2_aspace_get(struct gfs2_sbd *sdp) if (aspace) { mapping_set_gfp_mask(aspace->i_mapping, GFP_NOFS); aspace->i_mapping->a_ops = &aspace_aops; - aspace->i_size = ~0ULL; + aspace->i_size = MAX_LFS_FILESIZE; ip = GFS2_I(aspace); clear_bit(GIF_USER, &ip->i_flags); insert_inode_hash(aspace); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index edfee24f363..8a102f73100 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -82,6 +82,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); + init_waitqueue_head(&sdp->sd_glock_wait); + atomic_set(&sdp->sd_glock_disposal, 0); spin_lock_init(&sdp->sd_statfs_spin); spin_lock_init(&sdp->sd_rindex_spin); @@ -983,9 +985,17 @@ static const match_table_t nolock_tokens = { { Opt_err, NULL }, }; +static void nolock_put_lock(struct kmem_cache *cachep, struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_sbd; + kmem_cache_free(cachep, gl); + if (atomic_dec_and_test(&sdp->sd_glock_disposal)) + wake_up(&sdp->sd_glock_wait); +} + static const struct lm_lockops nolock_ops = { .lm_proto_name = "lock_nolock", - .lm_put_lock = kmem_cache_free, + .lm_put_lock = nolock_put_lock, .lm_tokens = &nolock_tokens, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 247436c10de..84350e1be66 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -748,7 +748,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - int alloc_required; + int alloc_required = 0; unsigned int x; int error; @@ -867,7 +867,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; } - alloc_required = error = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + if (nip == NULL) + alloc_required = gfs2_diradd_alloc_required(ndir, &ndentry->d_name); + error = alloc_required; if (error < 0) goto out_gunlock; error = 0; @@ -1086,7 +1088,8 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = vfs_follow_link(nd, buf); if (buf != array) kfree(buf); - } + } else + path_put(&nd->path); return ERR_PTR(error); } diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 0608f490c29..503b842f3ba 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -591,11 +591,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) u64 rgrp_count = ip->i_disksize; int error; - if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { - gfs2_consist_inode(ip); - return -EIO; - } - + do_div(rgrp_count, sizeof(struct gfs2_rindex)); clear_rgrpdi(sdp); file_ra_state_init(&ra_state, inode->i_mapping); @@ -915,7 +911,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc != NULL); - ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_KERNEL); + ip->i_alloc = kzalloc(sizeof(struct gfs2_alloc), GFP_NOFS); return ip->i_alloc; } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c282ad41f3d..b9dd3da22c0 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -21,6 +21,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/time.h> +#include <linux/wait.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 8a04108e0c2..c2ebdf2c01d 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1296,6 +1296,7 @@ fail: int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) { + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_ea_location el; struct buffer_head *dibh; int error; @@ -1305,16 +1306,17 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) return error; if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(GFS2_SB(&ip->i_inode), RES_DINODE + RES_EATTR, 0); - if (error) - return error; - - gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } else + error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); + if (error == 0) { + gfs2_trans_add_bh(ip->i_gl, el.el_bh, 1); + memcpy(GFS2_EA2DATA(el.el_ea), data, + GFS2_EA_DATA_LEN(el.el_ea)); + } + } else { error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); + } + brelse(el.el_bh); if (error) return error; @@ -1327,8 +1329,7 @@ int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) brelse(dibh); } - gfs2_trans_end(GFS2_SB(&ip->i_inode)); - + gfs2_trans_end(sdp); return error; } diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c index a5089a6dd67..7239efc690d 100644 --- a/fs/hppfs/hppfs.c +++ b/fs/hppfs/hppfs.c @@ -646,22 +646,27 @@ static const struct super_operations hppfs_sbops = { static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->readlink(proc_dentry, buffer, buflen); } static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) { - struct dentry *proc_dentry; - - proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; return proc_dentry->d_inode->i_op->follow_link(proc_dentry, nd); } +static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, + void *cookie) +{ + struct dentry *proc_dentry = HPPFS_I(dentry->d_inode)->proc_dentry; + + if (proc_dentry->d_inode->i_op->put_link) + proc_dentry->d_inode->i_op->put_link(proc_dentry, nd, cookie); +} + static const struct inode_operations hppfs_dir_iops = { .lookup = hppfs_lookup, }; @@ -669,6 +674,7 @@ static const struct inode_operations hppfs_dir_iops = { static const struct inode_operations hppfs_link_iops = { .readlink = hppfs_readlink, .follow_link = hppfs_follow_link, + .put_link = hppfs_put_link, }; static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) diff --git a/fs/internal.h b/fs/internal.h index f67cd141d9a..e96a1667d74 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -85,3 +85,10 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); + +/* + * open.c + */ +struct nameidata; +extern struct file *nameidata_to_filp(struct nameidata *); +extern void release_open_intent(struct nameidata *); diff --git a/fs/jbd/Kconfig b/fs/jbd/Kconfig index a8408983abd..4e28beeed15 100644 --- a/fs/jbd/Kconfig +++ b/fs/jbd/Kconfig @@ -1,6 +1,5 @@ config JBD tristate - select FS_JOURNAL_INFO help This is a generic journalling layer for block devices. It is currently used by the ext3 file system, but it could also be diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 4160afad6d0..bd224eec9b0 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1913,7 +1913,7 @@ static void __init jbd_create_debugfs_entry(void) { jbd_debugfs_dir = debugfs_create_dir("jbd", NULL); if (jbd_debugfs_dir) - jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO, + jbd_debug = debugfs_create_u8("jbd-debug", S_IRUGO | S_IWUSR, jbd_debugfs_dir, &journal_enable_debug); } diff --git a/fs/jbd2/Kconfig b/fs/jbd2/Kconfig index 0f7d1ceafdf..f32f346f4b0 100644 --- a/fs/jbd2/Kconfig +++ b/fs/jbd2/Kconfig @@ -1,7 +1,6 @@ config JBD2 tristate select CRC32 - select FS_JOURNAL_INFO help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index ca0f5eb62b2..88684937095 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -22,6 +22,7 @@ #include <linux/jbd2.h> #include <linux/errno.h> #include <linux/slab.h> +#include <linux/blkdev.h> #include <trace/events/jbd2.h> /* @@ -515,6 +516,20 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; spin_unlock(&journal->j_state_lock); + + /* + * If there is an external journal, we need to make sure that + * any data blocks that were recently written out --- perhaps + * by jbd2_log_do_checkpoint() --- are flushed out before we + * drop the transactions from the external journal. It's + * unlikely this will be necessary, especially with a + * appropriately sized journal, but we need this to guarantee + * correctness. Fortunately jbd2_cleanup_journal_tail() + * doesn't get called all that often. + */ + if ((journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); if (!(journal->j_flags & JBD2_ABORT)) jbd2_journal_update_superblock(journal, 1); return 0; diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 6a10238d2c6..1bc74b6f26d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -259,6 +259,7 @@ static int journal_submit_data_buffers(journal_t *journal, ret = err; spin_lock(&journal->j_list_lock); J_ASSERT(jinode->i_transaction == commit_transaction); + commit_transaction->t_flushed_data_blocks = 1; jinode->i_flags &= ~JI_COMMIT_RUNNING; wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING); } @@ -708,8 +709,17 @@ start_journal_io: } } - /* Done it all: now write the commit record asynchronously. */ + /* + * If the journal is not located on the file system device, + * then we must flush the file system device before we issue + * the commit record + */ + if (commit_transaction->t_flushed_data_blocks && + (journal->j_fs_dev != journal->j_dev) && + (journal->j_flags & JBD2_BARRIER)) + blkdev_issue_flush(journal->j_fs_dev, NULL); + /* Done it all: now write the commit record asynchronously. */ if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT)) { err = journal_submit_commit_record(journal, commit_transaction, @@ -720,13 +730,6 @@ start_journal_io: blkdev_issue_flush(journal->j_dev, NULL); } - /* - * This is the right place to wait for data buffers both for ASYNC - * and !ASYNC commit. If commit is ASYNC, we need to wait only after - * the commit block went to disk (which happens above). If commit is - * SYNC, we need to wait for data buffers before we start writing - * commit block, which happens below in such setting. - */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index b7ca3a92a4d..ac0d027595d 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -814,7 +814,7 @@ static journal_t * journal_init_common (void) journal_t *journal; int err; - journal = kzalloc(sizeof(*journal), GFP_KERNEL|__GFP_NOFAIL); + journal = kzalloc(sizeof(*journal), GFP_KERNEL); if (!journal) goto fail; @@ -2115,7 +2115,8 @@ static void __init jbd2_create_debugfs_entry(void) { jbd2_debugfs_dir = debugfs_create_dir("jbd2", NULL); if (jbd2_debugfs_dir) - jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, S_IRUGO, + jbd2_debug = debugfs_create_u8(JBD2_DEBUG_NAME, + S_IRUGO | S_IWUSR, jbd2_debugfs_dir, &jbd2_journal_enable_debug); } diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 2234c73fc57..d929a822a74 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -524,7 +524,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) * Page cache is indexed by long. * I would use MAX_LFS_FILESIZE, but it's only half as big */ - sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, sb->s_maxbytes); + sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes); #endif sb->s_time_gran = 1; return 0; diff --git a/fs/namei.c b/fs/namei.c index dad4b80257d..d62fdc875f2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -37,8 +37,6 @@ #include "internal.h" -#define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE]) - /* [Feb-1997 T. Schoebel-Theuer] * Fundamental changes in the pathname lookup mechanisms (namei) * were necessary because of omirr. The reason is that omirr needs @@ -234,6 +232,7 @@ int generic_permission(struct inode *inode, int mask, /* * Searching includes executable on directories, else just read. */ + mask &= MAY_READ | MAY_WRITE | MAY_EXEC; if (mask == MAY_READ || (S_ISDIR(inode->i_mode) && !(mask & MAY_WRITE))) if (capable(CAP_DAC_READ_SEARCH)) return 0; @@ -562,6 +561,7 @@ static __always_inline int __do_follow_link(struct path *path, struct nameidata dget(dentry); } mntget(path->mnt); + nd->last_type = LAST_BIND; cookie = dentry->d_inode->i_op->follow_link(dentry, nd); error = PTR_ERR(cookie); if (!IS_ERR(cookie)) { @@ -1604,11 +1604,12 @@ struct file *do_filp_open(int dfd, const char *pathname, struct file *filp; struct nameidata nd; int error; - struct path path, save; + struct path path; struct dentry *dir; int count = 0; int will_truncate; int flag = open_to_namei_flags(open_flag); + int force_reval = 0; /* * O_SYNC is implemented as __O_SYNC|O_DSYNC. As many places only @@ -1620,7 +1621,7 @@ struct file *do_filp_open(int dfd, const char *pathname, open_flag |= O_DSYNC; if (!acc_mode) - acc_mode = MAY_OPEN | ACC_MODE(flag); + acc_mode = MAY_OPEN | ACC_MODE(open_flag); /* O_TRUNC implies we need access checks for write permissions */ if (flag & O_TRUNC) @@ -1640,6 +1641,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) return ERR_PTR(-ENFILE); nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = 0; error = do_path_lookup(dfd, pathname, @@ -1659,9 +1661,12 @@ struct file *do_filp_open(int dfd, const char *pathname, /* * Create - we need to know the parent. */ +reval: error = path_init(dfd, pathname, LOOKUP_PARENT, &nd); if (error) return ERR_PTR(error); + if (force_reval) + nd.flags |= LOOKUP_REVAL; error = path_walk(pathname, &nd); if (error) { if (nd.root.mnt) @@ -1685,6 +1690,7 @@ struct file *do_filp_open(int dfd, const char *pathname, if (filp == NULL) goto exit_parent; nd.intent.open.file = filp; + filp->f_flags = open_flag; nd.intent.open.flags = flag; nd.intent.open.create_mode = mode; dir = nd.path.dentry; @@ -1725,13 +1731,12 @@ do_last: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); mnt_drop_write(nd.path.mnt); if (nd.root.mnt) path_put(&nd.root); if (!IS_ERR(filp)) { - error = ima_path_check(&filp->f_path, filp->f_mode & - (MAY_READ | MAY_WRITE | MAY_EXEC)); + error = ima_file_check(filp, acc_mode); if (error) { fput(filp); filp = ERR_PTR(error); @@ -1789,10 +1794,9 @@ ok: mnt_drop_write(nd.path.mnt); goto exit; } - filp = nameidata_to_filp(&nd, open_flag); + filp = nameidata_to_filp(&nd); if (!IS_ERR(filp)) { - error = ima_path_check(&filp->f_path, filp->f_mode & - (MAY_READ | MAY_WRITE | MAY_EXEC)); + error = ima_file_check(filp, acc_mode); if (error) { fput(filp); filp = ERR_PTR(error); @@ -1852,17 +1856,7 @@ do_link: error = security_inode_follow_link(path.dentry, &nd); if (error) goto exit_dput; - save = nd.path; - path_get(&save); error = __do_follow_link(&path, &nd); - if (error == -ESTALE) { - /* nd.path had been dropped */ - nd.path = save; - path_get(&nd.path); - nd.flags |= LOOKUP_REVAL; - error = __do_follow_link(&path, &nd); - } - path_put(&save); path_put(&path); if (error) { /* Does someone understand code flow here? Or it is only @@ -1872,6 +1866,10 @@ do_link: release_open_intent(&nd); if (nd.root.mnt) path_put(&nd.root); + if (error == -ESTALE && !force_reval) { + force_reval = 1; + goto reval; + } return ERR_PTR(error); } nd.flags &= ~LOOKUP_PARENT; diff --git a/fs/namespace.c b/fs/namespace.c index faab1273281..c768f733c8d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -965,10 +965,12 @@ EXPORT_SYMBOL(may_umount_tree); int may_umount(struct vfsmount *mnt) { int ret = 1; + down_read(&namespace_sem); spin_lock(&vfsmount_lock); if (propagate_mount_busy(mnt, 2)) ret = 0; spin_unlock(&vfsmount_lock); + up_read(&namespace_sem); return ret; } @@ -1352,12 +1354,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, if (err) goto out_cleanup_ids; + spin_lock(&vfsmount_lock); + if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) set_mnt_shared(p); } - - spin_lock(&vfsmount_lock); if (parent_path) { detach_mnt(source_mnt, parent_path); attach_mnt(source_mnt, path); @@ -1534,8 +1536,12 @@ static int do_remount(struct path *path, int flags, int mnt_flags, err = change_mount_flags(path->mnt, flags); else err = do_remount_sb(sb, flags, data, 0); - if (!err) + if (!err) { + spin_lock(&vfsmount_lock); + mnt_flags |= path->mnt->mnt_flags & MNT_PNODE_MASK; path->mnt->mnt_flags = mnt_flags; + spin_unlock(&vfsmount_lock); + } up_write(&sb->s_umount); if (!err) { security_sb_post_remount(path->mnt, flags, data); @@ -1665,6 +1671,8 @@ int do_add_mount(struct vfsmount *newmnt, struct path *path, { int err; + mnt_flags &= ~(MNT_SHARED | MNT_WRITE_HOLD); + down_write(&namespace_sem); /* Something was mounted here while we slept */ while (d_mountpoint(path->dentry) && @@ -2068,7 +2076,7 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns, * create_mnt_ns - creates a private namespace and adds a root filesystem * @mnt: pointer to the new root filesystem mountpoint */ -static struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) +struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) { struct mnt_namespace *new_ns; @@ -2080,6 +2088,7 @@ static struct mnt_namespace *create_mnt_ns(struct vfsmount *mnt) } return new_ns; } +EXPORT_SYMBOL(create_mnt_ns); SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, char __user *, type, unsigned long, flags, void __user *, data) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 2c5ace4f00a..3c7f03b669f 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1615,6 +1615,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; new_dentry = dentry; + rehash = NULL; new_inode = NULL; } } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6b891328f33..63f2071d644 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -486,6 +486,8 @@ static int nfs_release_page(struct page *page, gfp_t gfp) { dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); + if (gfp & __GFP_WAIT) + nfs_wb_page(page->mapping->host, page); /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) return 0; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index faa091865ad..f141bde7756 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1261,8 +1261,10 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) if (fattr->valid & NFS_ATTR_FATTR_MODE) { if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) { + umode_t newmode = inode->i_mode & S_IFMT; + newmode |= fattr->mode & S_IALLUGO; + inode->i_mode = newmode; invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; - inode->i_mode = fattr->mode; } } else if (server->caps & NFS_CAP_MODE) invalid |= save_cache_validity & (NFS_INO_INVALID_ATTR diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 865265bdca0..0c6fda33d66 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -146,6 +146,7 @@ enum { NFS_O_RDWR_STATE, /* OPEN stateid has read/write state */ NFS_STATE_RECLAIM_REBOOT, /* OPEN stateid server rebooted */ NFS_STATE_RECLAIM_NOGRACE, /* OPEN stateid needs to recover state */ + NFS_STATE_POSIX_LOCKS, /* Posix locks are supported */ }; struct nfs4_state { @@ -277,6 +278,7 @@ extern void nfs4_state_set_mode_locked(struct nfs4_state *, fmode_t); extern void nfs4_schedule_state_recovery(struct nfs_client *); extern void nfs4_schedule_state_manager(struct nfs_client *); extern int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state); +extern int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state); extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 198d51d17c1..375f0fae2c6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -249,19 +249,15 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, if (state == NULL) break; nfs4_state_mark_reclaim_nograce(clp, state); - case -NFS4ERR_STALE_CLIENTID: + goto do_state_recovery; case -NFS4ERR_STALE_STATEID: - case -NFS4ERR_EXPIRED: - nfs4_schedule_state_recovery(clp); - ret = nfs4_wait_clnt_recover(clp); - if (ret == 0) - exception->retry = 1; -#if !defined(CONFIG_NFS_V4_1) - break; -#else /* !defined(CONFIG_NFS_V4_1) */ - if (!nfs4_has_session(server->nfs_client)) + if (state == NULL) break; - /* FALLTHROUGH */ + nfs4_state_mark_reclaim_reboot(clp, state); + case -NFS4ERR_STALE_CLIENTID: + case -NFS4ERR_EXPIRED: + goto do_state_recovery; +#if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: case -NFS4ERR_BAD_HIGH_SLOT: @@ -274,7 +270,7 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, nfs4_schedule_state_recovery(clp); exception->retry = 1; break; -#endif /* !defined(CONFIG_NFS_V4_1) */ +#endif /* defined(CONFIG_NFS_V4_1) */ case -NFS4ERR_FILE_OPEN: if (exception->timeout > HZ) { /* We have retried a decent amount, time to @@ -293,6 +289,12 @@ static int nfs4_handle_exception(const struct nfs_server *server, int errorcode, } /* We failed to handle the error */ return nfs4_map_errors(ret); +do_state_recovery: + nfs4_schedule_state_recovery(clp); + ret = nfs4_wait_clnt_recover(clp); + if (ret == 0) + exception->retry = 1; + return ret; } @@ -1658,6 +1660,8 @@ static int _nfs4_do_open(struct inode *dir, struct path *path, fmode_t fmode, in status = PTR_ERR(state); if (IS_ERR(state)) goto err_opendata_put; + if ((opendata->o_res.rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) != 0) + set_bit(NFS_STATE_POSIX_LOCKS, &state->flags); nfs4_opendata_put(opendata); nfs4_put_state_owner(sp); *res = state; @@ -3422,15 +3426,14 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, if (state == NULL) break; nfs4_state_mark_reclaim_nograce(clp, state); - case -NFS4ERR_STALE_CLIENTID: + goto do_state_recovery; case -NFS4ERR_STALE_STATEID: + if (state == NULL) + break; + nfs4_state_mark_reclaim_reboot(clp, state); + case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_EXPIRED: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); - nfs4_schedule_state_recovery(clp); - if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) - rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); - task->tk_status = 0; - return -EAGAIN; + goto do_state_recovery; #if defined(CONFIG_NFS_V4_1) case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -3458,6 +3461,13 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, } task->tk_status = nfs4_map_errors(task->tk_status); return 0; +do_state_recovery: + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); + nfs4_schedule_state_recovery(clp); + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) == 0) + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); + task->tk_status = 0; + return -EAGAIN; } static int @@ -4088,6 +4098,28 @@ static const struct rpc_call_ops nfs4_recover_lock_ops = { .rpc_release = nfs4_lock_release, }; +static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_state *lsp, int new_lock_owner, int error) +{ + struct nfs_client *clp = server->nfs_client; + struct nfs4_state *state = lsp->ls_state; + + switch (error) { + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_EXPIRED: + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_state_mark_reclaim_nograce(clp, state); + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + break; + case -NFS4ERR_STALE_STATEID: + if (new_lock_owner != 0 || + (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) + nfs4_state_mark_reclaim_reboot(clp, state); + lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED; + }; +} + static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *fl, int recovery_type) { struct nfs4_lockdata *data; @@ -4126,6 +4158,9 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f ret = nfs4_wait_for_completion_rpc_task(task); if (ret == 0) { ret = data->rpc_status; + if (ret) + nfs4_handle_setlk_error(data->server, data->lsp, + data->arg.new_lock_owner, ret); } else data->cancelled = 1; rpc_put_task(task); @@ -4181,8 +4216,11 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock { struct nfs_inode *nfsi = NFS_I(state->inode); unsigned char fl_flags = request->fl_flags; - int status; + int status = -ENOLCK; + if ((fl_flags & FL_POSIX) && + !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags)) + goto out; /* Is this a delegated open? */ status = nfs4_set_lock_state(state, request); if (status != 0) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 6d263ed79e9..c1e2733f4fa 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -901,7 +901,7 @@ void nfs4_schedule_state_recovery(struct nfs_client *clp) nfs4_schedule_state_manager(clp); } -static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) +int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state) { set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index e2975939126..a12c45b65dd 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -176,6 +176,12 @@ void nfs_release_request(struct nfs_page *req) kref_put(&req->wb_kref, nfs_free_request); } +static int nfs_wait_bit_uninterruptible(void *word) +{ + io_schedule(); + return 0; +} + /** * nfs_wait_on_request - Wait for a request to complete. * @req: request to wait upon. @@ -186,14 +192,9 @@ void nfs_release_request(struct nfs_page *req) int nfs_wait_on_request(struct nfs_page *req) { - int ret = 0; - - if (!test_bit(PG_BUSY, &req->wb_flags)) - goto out; - ret = out_of_line_wait_on_bit(&req->wb_flags, PG_BUSY, - nfs_wait_bit_killable, TASK_KILLABLE); -out: - return ret; + return wait_on_bit(&req->wb_flags, PG_BUSY, + nfs_wait_bit_uninterruptible, + TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/nfs/super.c b/fs/nfs/super.c index d5b112bcf3d..f1afee4eea7 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -243,6 +243,7 @@ static int nfs_show_stats(struct seq_file *, struct vfsmount *); static int nfs_get_sb(struct file_system_type *, int, const char *, void *, struct vfsmount *); static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); +static void nfs_put_super(struct super_block *); static void nfs_kill_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); @@ -266,6 +267,7 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -335,6 +337,7 @@ static const struct super_operations nfs4_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, + .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs4_clear_inode, .umount_begin = nfs_umount_begin, @@ -2258,6 +2261,17 @@ error_splat_super: } /* + * Ensure that we unregister the bdi before kill_anon_super + * releases the device name + */ +static void nfs_put_super(struct super_block *s) +{ + struct nfs_server *server = NFS_SB(s); + + bdi_unregister(&server->backing_dev_info); +} + +/* * Destroy an NFS2/3 superblock */ static void nfs_kill_super(struct super_block *s) @@ -2265,7 +2279,6 @@ static void nfs_kill_super(struct super_block *s) struct nfs_server *server = NFS_SB(s); kill_anon_super(s); - bdi_unregister(&server->backing_dev_info); nfs_fscache_release_super_cookie(s); nfs_free_server(server); } @@ -2648,13 +2661,21 @@ out_freepage: static int nfs_follow_remote_path(struct vfsmount *root_mnt, const char *export_path, struct vfsmount *mnt_target) { + struct mnt_namespace *ns_private; struct nameidata nd; struct super_block *s; int ret; + ns_private = create_mnt_ns(root_mnt); + ret = PTR_ERR(ns_private); + if (IS_ERR(ns_private)) + goto out_mntput; + ret = vfs_path_lookup(root_mnt->mnt_root, root_mnt, export_path, LOOKUP_FOLLOW, &nd); + put_mnt_ns(ns_private); + if (ret != 0) goto out_err; diff --git a/fs/nfs/sysctl.c b/fs/nfs/sysctl.c index 70e1fbbaaea..ad4d2e787b2 100644 --- a/fs/nfs/sysctl.c +++ b/fs/nfs/sysctl.c @@ -15,8 +15,10 @@ #include "callback.h" +#ifdef CONFIG_NFS_V4 static const int nfs_set_port_min = 0; static const int nfs_set_port_max = 65535; +#endif static struct ctl_table_header *nfs_callback_sysctl_table; static ctl_table nfs_cb_sysctls[] = { diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d171696017f..7b54b8bb101 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1233,7 +1233,7 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commitdata_release(void *data) +static void nfs_commitdata_release(void *data) { struct nfs_write_data *wdata = data; @@ -1541,6 +1541,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) break; } ret = nfs_wait_on_request(req); + nfs_release_request(req); if (ret < 0) goto out; } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 1c12177b908..55c8e63af0b 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -89,7 +89,7 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, int flags = nfsexp_flags(rqstp, exp); /* Check if the request originated from a secure port. */ - if (!rqstp->rq_secure && (flags & NFSEXP_INSECURE_PORT)) { + if (!rqstp->rq_secure && !(flags & NFSEXP_INSECURE_PORT)) { RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); dprintk(KERN_WARNING "nfsd: request from insecure port %s!\n", diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 7c2e337d05a..97d79eff6b7 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -752,6 +752,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, flags, current_cred()); if (IS_ERR(*filp)) host_err = PTR_ERR(*filp); + host_err = ima_file_check(*filp, access); out_nfserr: err = nfserrno(host_err); out: @@ -780,12 +781,9 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, int (*fsync) (struct file *, struct dentry *, int); int err; - err = filemap_fdatawrite(inode->i_mapping); + err = filemap_write_and_wait(inode->i_mapping); if (err == 0 && fop && (fsync = fop->fsync)) err = fsync(filp, dp, 0); - if (err == 0) - err = filemap_fdatawait(inode->i_mapping); - return err; } @@ -2130,7 +2128,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, */ path.mnt = exp->ex_path.mnt; path.dentry = dentry; - err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC)); nfsd_out: return err? nfserrno(err) : 0; } diff --git a/fs/nilfs2/Kconfig b/fs/nilfs2/Kconfig index 1225af7b216..251da07b2a1 100644 --- a/fs/nilfs2/Kconfig +++ b/fs/nilfs2/Kconfig @@ -2,7 +2,6 @@ config NILFS2_FS tristate "NILFS2 file system support (EXPERIMENTAL)" depends on EXPERIMENTAL select CRC32 - select FS_JOURNAL_INFO help NILFS2 is a log-structured file system (LFS) supporting continuous snapshotting. In addition to versioning capability of the entire diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index f4a14ea2ed9..effdbdbe6c1 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -417,8 +417,8 @@ __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap, key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT - bmap->b_inode->i_blkbits); - for (pbh = page_buffers(bh->b_page); pbh != bh; - pbh = pbh->b_this_page, key++); + for (pbh = page_buffers(bh->b_page); pbh != bh; pbh = pbh->b_this_page) + key++; return key; } diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c index d5ad54e204a..18737818db6 100644 --- a/fs/nilfs2/cpfile.c +++ b/fs/nilfs2/cpfile.c @@ -328,19 +328,24 @@ int nilfs_cpfile_delete_checkpoints(struct inode *cpfile, tnicps += nicps; nilfs_mdt_mark_buffer_dirty(cp_bh); nilfs_mdt_mark_dirty(cpfile); - if (!nilfs_cpfile_is_in_first(cpfile, cno) && - (count = nilfs_cpfile_block_sub_valid_checkpoints( - cpfile, cp_bh, kaddr, nicps)) == 0) { - /* make hole */ - kunmap_atomic(kaddr, KM_USER0); - brelse(cp_bh); - ret = nilfs_cpfile_delete_checkpoint_block( - cpfile, cno); - if (ret == 0) - continue; - printk(KERN_ERR "%s: cannot delete block\n", - __func__); - break; + if (!nilfs_cpfile_is_in_first(cpfile, cno)) { + count = + nilfs_cpfile_block_sub_valid_checkpoints( + cpfile, cp_bh, kaddr, nicps); + if (count == 0) { + /* make hole */ + kunmap_atomic(kaddr, KM_USER0); + brelse(cp_bh); + ret = + nilfs_cpfile_delete_checkpoint_block( + cpfile, cno); + if (ret == 0) + continue; + printk(KERN_ERR + "%s: cannot delete block\n", + __func__); + break; + } } } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index d369ac71827..236753df5cd 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -51,11 +51,11 @@ static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, struct nilfs_direct *direct; __u64 ptr; - direct = (struct nilfs_direct *)bmap; - if ((key > NILFS_DIRECT_KEY_MAX) || - (level != 1) || /* XXX: use macro for level 1 */ - ((ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR)) + direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */ + if (key > NILFS_DIRECT_KEY_MAX || level != 1) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (ptrp != NULL) @@ -73,9 +73,10 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, sector_t blocknr; int ret, cnt; - if (key > NILFS_DIRECT_KEY_MAX || - (ptr = nilfs_direct_get_ptr(direct, key)) == - NILFS_BMAP_INVALID_PTR) + if (key > NILFS_DIRECT_KEY_MAX) + return -ENOENT; + ptr = nilfs_direct_get_ptr(direct, key); + if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; if (NILFS_BMAP_USE_VBN(bmap)) { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index f6af76042d8..d6b2b83de36 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -480,7 +480,7 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, unsigned int cmd, void __user *argp) { struct nilfs_argv argv[5]; - const static size_t argsz[5] = { + static const size_t argsz[5] = { sizeof(struct nilfs_vdesc), sizeof(struct nilfs_period), sizeof(__u64), diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 17584c52448..105b508b47a 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2829,7 +2829,7 @@ static void nilfs_segctor_destroy(struct nilfs_sc_info *sci) || sci->sc_seq_request != sci->sc_seq_done); spin_unlock(&sci->sc_state_lock); - if (flag || nilfs_segctor_confirm(sci)) + if (flag || !nilfs_segctor_confirm(sci)) nilfs_segctor_write_out(sci); WARN_ON(!list_empty(&sci->sc_copied_buffers)); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index c9ee67b442e..1afb0a10229 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -121,7 +121,7 @@ static int idr_callback(int id, void *p, void *data) if (warned) return 0; - warned = false; + warned = true; entry = p; ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 8271cf05c95..a94e8bd8eb1 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -552,7 +552,7 @@ retry: spin_lock(&group->inotify_data.idr_lock); ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, + group->inotify_data.last_wd+1, &tmp_ientry->wd); spin_unlock(&group->inotify_data.idr_lock); if (ret) { @@ -632,7 +632,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 1; + group->inotify_data.last_wd = 0; group->inotify_data.user = user; group->inotify_data.fa = NULL; diff --git a/fs/ocfs2/Kconfig b/fs/ocfs2/Kconfig index 701b7a3a872..0d840669698 100644 --- a/fs/ocfs2/Kconfig +++ b/fs/ocfs2/Kconfig @@ -6,6 +6,7 @@ config OCFS2_FS select CRC32 select QUOTA select QUOTA_TREE + select FS_POSIX_ACL help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -74,12 +75,3 @@ config OCFS2_DEBUG_FS This option will enable expensive consistency checks. Enable this option for debugging only as it is likely to decrease performance of the filesystem. - -config OCFS2_FS_POSIX_ACL - bool "OCFS2 POSIX Access Control Lists" - depends on OCFS2_FS - select FS_POSIX_ACL - default n - help - Posix Access Control Lists (ACLs) support permissions for users and - groups beyond the owner/group/world scheme. diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 31f25ce32c9..600d2d2ade1 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -39,11 +39,8 @@ ocfs2-objs := \ ver.o \ quota_local.o \ quota_global.o \ - xattr.o - -ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) -ocfs2-objs += acl.o -endif + xattr.o \ + acl.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index e3e47415d85..0501974bedd 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -98,15 +98,11 @@ static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, int type, struct buffer_head *di_bh) { - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int name_index; char *value = NULL; struct posix_acl *acl; int retval; - if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) - return NULL; - switch (type) { case ACL_TYPE_ACCESS: name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h index 8f6389ed4da..5c5d31f0585 100644 --- a/fs/ocfs2/acl.h +++ b/fs/ocfs2/acl.h @@ -26,8 +26,6 @@ struct ocfs2_acl_entry { __le32 e_id; }; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL - extern int ocfs2_check_acl(struct inode *, int); extern int ocfs2_acl_chmod(struct inode *); extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, @@ -35,24 +33,4 @@ extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, struct ocfs2_alloc_context *, struct ocfs2_alloc_context *); -#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ - -#define ocfs2_check_acl NULL -static inline int ocfs2_acl_chmod(struct inode *inode) -{ - return 0; -} -static inline int ocfs2_init_acl(handle_t *handle, - struct inode *inode, - struct inode *dir, - struct buffer_head *di_bh, - struct buffer_head *dir_bh, - struct ocfs2_alloc_context *meta_ac, - struct ocfs2_alloc_context *data_ac) -{ - return 0; -} - -#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ - #endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index fb4e672579b..d17bdc718f7 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1765,9 +1765,9 @@ set_and_inc: * * The array index of the subtree root is passed back. */ -static int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, - struct ocfs2_path *left, - struct ocfs2_path *right) +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right) { int i = 0; @@ -2872,8 +2872,8 @@ out: * This looks similar, but is subtly different to * ocfs2_find_cpos_for_left_leaf(). */ -static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, - struct ocfs2_path *path, u32 *cpos) +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos) { int i, j, ret = 0; u64 blkno; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 9c122d57446..1db4359ccb9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -317,4 +317,9 @@ int ocfs2_path_bh_journal_access(handle_t *handle, int ocfs2_journal_access_path(struct ocfs2_caching_info *ci, handle_t *handle, struct ocfs2_path *path); +int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, + struct ocfs2_path *path, u32 *cpos); +int ocfs2_find_subtree_root(struct ocfs2_extent_tree *et, + struct ocfs2_path *left, + struct ocfs2_path *right); #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index c452d116b89..eda5b8bcddd 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -176,7 +176,8 @@ static void o2hb_write_timeout(struct work_struct *work) static void o2hb_arm_write_timeout(struct o2hb_region *reg) { - mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS); + mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", + O2HB_MAX_WRITE_TIMEOUT_MS); cancel_delayed_work(®->hr_write_timeout_work); reg->hr_last_timeout_start = jiffies; @@ -874,7 +875,8 @@ static int o2hb_thread(void *data) do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); - mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n", + mlog(ML_HEARTBEAT, + "start = %lu.%lu, end = %lu.%lu, msec = %u\n", before_hb.tv_sec, (unsigned long) before_hb.tv_usec, after_hb.tv_sec, (unsigned long) after_hb.tv_usec, elapsed_msec); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 7ee6188bc79..c81142e3ef8 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -35,6 +35,10 @@ * cluster references throughout where nodes are looked up */ struct o2nm_cluster *o2nm_single_cluster = NULL; +char *o2nm_fence_method_desc[O2NM_FENCE_METHODS] = { + "reset", /* O2NM_FENCE_RESET */ + "panic", /* O2NM_FENCE_PANIC */ +}; struct o2nm_node *o2nm_get_node_by_num(u8 node_num) { @@ -579,6 +583,43 @@ static ssize_t o2nm_cluster_attr_reconnect_delay_ms_write( return o2nm_cluster_attr_write(page, count, &cluster->cl_reconnect_delay_ms); } + +static ssize_t o2nm_cluster_attr_fence_method_read( + struct o2nm_cluster *cluster, char *page) +{ + ssize_t ret = 0; + + if (cluster) + ret = sprintf(page, "%s\n", + o2nm_fence_method_desc[cluster->cl_fence_method]); + return ret; +} + +static ssize_t o2nm_cluster_attr_fence_method_write( + struct o2nm_cluster *cluster, const char *page, size_t count) +{ + unsigned int i; + + if (page[count - 1] != '\n') + goto bail; + + for (i = 0; i < O2NM_FENCE_METHODS; ++i) { + if (count != strlen(o2nm_fence_method_desc[i]) + 1) + continue; + if (strncasecmp(page, o2nm_fence_method_desc[i], count - 1)) + continue; + if (cluster->cl_fence_method != i) { + printk(KERN_INFO "ocfs2: Changing fence method to %s\n", + o2nm_fence_method_desc[i]); + cluster->cl_fence_method = i; + } + return count; + } + +bail: + return -EINVAL; +} + static struct o2nm_cluster_attribute o2nm_cluster_attr_idle_timeout_ms = { .attr = { .ca_owner = THIS_MODULE, .ca_name = "idle_timeout_ms", @@ -603,10 +644,19 @@ static struct o2nm_cluster_attribute o2nm_cluster_attr_reconnect_delay_ms = { .store = o2nm_cluster_attr_reconnect_delay_ms_write, }; +static struct o2nm_cluster_attribute o2nm_cluster_attr_fence_method = { + .attr = { .ca_owner = THIS_MODULE, + .ca_name = "fence_method", + .ca_mode = S_IRUGO | S_IWUSR }, + .show = o2nm_cluster_attr_fence_method_read, + .store = o2nm_cluster_attr_fence_method_write, +}; + static struct configfs_attribute *o2nm_cluster_attrs[] = { &o2nm_cluster_attr_idle_timeout_ms.attr, &o2nm_cluster_attr_keepalive_delay_ms.attr, &o2nm_cluster_attr_reconnect_delay_ms.attr, + &o2nm_cluster_attr_fence_method.attr, NULL, }; static ssize_t o2nm_cluster_show(struct config_item *item, @@ -778,6 +828,7 @@ static struct config_group *o2nm_cluster_group_make_group(struct config_group *g cluster->cl_reconnect_delay_ms = O2NET_RECONNECT_DELAY_MS_DEFAULT; cluster->cl_idle_timeout_ms = O2NET_IDLE_TIMEOUT_MS_DEFAULT; cluster->cl_keepalive_delay_ms = O2NET_KEEPALIVE_DELAY_MS_DEFAULT; + cluster->cl_fence_method = O2NM_FENCE_RESET; ret = &cluster->cl_group; o2nm_single_cluster = cluster; diff --git a/fs/ocfs2/cluster/nodemanager.h b/fs/ocfs2/cluster/nodemanager.h index c992ea0da4a..09ea2d388bb 100644 --- a/fs/ocfs2/cluster/nodemanager.h +++ b/fs/ocfs2/cluster/nodemanager.h @@ -33,6 +33,12 @@ #include <linux/configfs.h> #include <linux/rbtree.h> +enum o2nm_fence_method { + O2NM_FENCE_RESET = 0, + O2NM_FENCE_PANIC, + O2NM_FENCE_METHODS, /* Number of fence methods */ +}; + struct o2nm_node { spinlock_t nd_lock; struct config_item nd_item; @@ -58,6 +64,7 @@ struct o2nm_cluster { unsigned int cl_idle_timeout_ms; unsigned int cl_keepalive_delay_ms; unsigned int cl_reconnect_delay_ms; + enum o2nm_fence_method cl_fence_method; /* this bitmap is part of a hack for disk bitmap.. will go eventually. - zab */ unsigned long cl_nodes_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index bbacf7da48a..639024033fc 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -74,8 +74,20 @@ static void o2quo_fence_self(void) * threads can still schedule, etc, etc */ o2hb_stop_all_regions(); - printk("ocfs2 is very sorry to be fencing this system by restarting\n"); - emergency_restart(); + switch (o2nm_single_cluster->cl_fence_method) { + case O2NM_FENCE_PANIC: + panic("*** ocfs2 is very sorry to be fencing this system by " + "panicing ***\n"); + break; + default: + WARN_ON(o2nm_single_cluster->cl_fence_method >= + O2NM_FENCE_METHODS); + case O2NM_FENCE_RESET: + printk(KERN_ERR "*** ocfs2 is very sorry to be fencing this " + "system by restarting ***\n"); + emergency_restart(); + break; + }; } /* Indicate that a timeout occured on a hearbeat region write. The diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index d9fa3d22e17..2f9e4e19a4f 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2589,6 +2589,14 @@ retry: "begin reco msg (%d)\n", dlm->name, nodenum, ret); ret = 0; } + if (ret == -EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + msleep(100); + goto retry; + } if (ret < 0) { struct dlm_lock_resource *res; /* this is now a serious problem, possibly ENOMEM @@ -2608,14 +2616,6 @@ retry: * another ENOMEM */ msleep(100); goto retry; - } else if (ret == EAGAIN) { - mlog(0, "%s: trying to start recovery of node " - "%u, but node %u is waiting for last recovery " - "to complete, backoff for a bit\n", dlm->name, - dead_node, nodenum); - /* TODO Look into replacing msleep with cond_resched() */ - msleep(100); - goto retry; } } @@ -2639,7 +2639,7 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data, dlm->name, br->node_idx, br->dead_node, dlm->reco.dead_node, dlm->reco.new_master); spin_unlock(&dlm->spinlock); - return EAGAIN; + return -EAGAIN; } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 843db64e9d4..d35a27f4523 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -37,6 +37,7 @@ #include "extent_map.h" #include "inode.h" #include "super.h" +#include "symlink.h" #include "buffer_head_io.h" @@ -703,6 +704,12 @@ out: return ret; } +/* + * The ocfs2_fiemap_inline() may be a little bit misleading, since + * it not only handles the fiemap for inlined files, but also deals + * with the fast symlink, cause they have no difference for extent + * mapping per se. + */ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct fiemap_extent_info *fieinfo, u64 map_start) @@ -715,11 +722,18 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, struct ocfs2_inode_info *oi = OCFS2_I(inode); di = (struct ocfs2_dinode *)di_bh->b_data; - id_count = le16_to_cpu(di->id2.i_data.id_count); + if (ocfs2_inode_is_fast_symlink(inode)) + id_count = ocfs2_fast_symlink_chars(inode->i_sb); + else + id_count = le16_to_cpu(di->id2.i_data.id_count); if (map_start < id_count) { phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; - phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + if (ocfs2_inode_is_fast_symlink(inode)) + phys += offsetof(struct ocfs2_dinode, id2.i_symlink); + else + phys += offsetof(struct ocfs2_dinode, + id2.i_data.id_data); ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, flags); @@ -756,9 +770,10 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, down_read(&OCFS2_I(inode)->ip_alloc_sem); /* - * Handle inline-data separately. + * Handle inline-data and fast symlink separately. */ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if ((OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || + ocfs2_inode_is_fast_symlink(inode)) { ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); goto out_unlock; } @@ -786,6 +801,8 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fe_flags = 0; if (rec.e_flags & OCFS2_EXT_UNWRITTEN) fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + fe_flags |= FIEMAP_EXTENT_SHARED; if (is_last) fe_flags |= FIEMAP_EXTENT_LAST; len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 3d30a1c974a..06ccf6a86d3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1772,7 +1772,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, loff_t *ppos, size_t count, int appending, - int *direct_io) + int *direct_io, + int *has_refcount) { int ret = 0, meta_level = 0; struct inode *inode = dentry->d_inode; @@ -1833,6 +1834,8 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, saved_pos, count, &meta_level); + if (has_refcount) + *has_refcount = 1; } if (ret < 0) { @@ -1856,6 +1859,10 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry, break; } + if (has_refcount && *has_refcount == 1) { + *direct_io = 0; + break; + } /* * Allowing concurrent direct writes means * i_size changes wouldn't be synchronized, so @@ -1899,7 +1906,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, loff_t pos) { int ret, direct_io, appending, rw_level, have_alloc_sem = 0; - int can_do_direct; + int can_do_direct, has_refcount = 0; ssize_t written = 0; size_t ocount; /* original count */ size_t count; /* after file limit checks */ @@ -1942,7 +1949,7 @@ relock: can_do_direct = direct_io; ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, iocb->ki_left, appending, - &can_do_direct); + &can_do_direct, &has_refcount); if (ret < 0) { mlog_errno(ret); goto out; @@ -2006,14 +2013,16 @@ out_dio: /* buffered aio wouldn't have proper lock coverage today */ BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); - if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode)) { + if ((file->f_flags & O_DSYNC && !direct_io) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT && has_refcount)) { ret = filemap_fdatawrite_range(file->f_mapping, pos, pos + count - 1); if (ret < 0) written = ret; if (!ret && (old_size != i_size_read(inode) || - old_clusters != OCFS2_I(inode)->ip_clusters)) { + old_clusters != OCFS2_I(inode)->ip_clusters || + has_refcount)) { ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; @@ -2062,7 +2071,7 @@ static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, int ret; ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, &sd->pos, - sd->total_len, 0, NULL); + sd->total_len, 0, NULL, NULL); if (ret < 0) { mlog_errno(ret); return ret; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index f010b22b1c4..50fb26a6a5f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2108,6 +2108,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } did_quota_inode = 1; + inode->i_nlink = 0; /* do the real work now. */ status = ocfs2_mknod_locked(osb, dir, inode, 0, &new_di_bh, parent_di_bh, handle, @@ -2136,6 +2137,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, if (status < 0) mlog_errno(status); + insert_inode_hash(inode); leave: if (status < 0 && did_quota_inode) vfs_dq_free_inode(inode); @@ -2267,6 +2269,8 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, di = (struct ocfs2_dinode *)di_bh->b_data; le32_add_cpu(&di->i_flags, -OCFS2_ORPHANED_FL); di->i_orphaned_slot = 0; + inode->i_nlink = 1; + ocfs2_set_links_count(di, inode->i_nlink); ocfs2_journal_dirty(handle, di_bh); status = ocfs2_add_entry(handle, dentry, inode, @@ -2284,7 +2288,6 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto out_commit; } - insert_inode_hash(inode); dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); status = 0; @@ -2326,4 +2329,5 @@ const struct inode_operations ocfs2_dir_iops = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index d963d863870..9362eea7424 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -245,9 +245,11 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ - OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ - OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ - OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* Force POSIX access control lists */ + OCFS2_MOUNT_NO_POSIX_ACL = 1 << 9, /* Disable POSIX access + control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 10, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 11, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index e9431e4a5e7..1a1a679e51b 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1202,7 +1202,7 @@ struct ocfs2_local_disk_dqinfo { /* Header of one chunk of a quota file */ struct ocfs2_local_disk_chunk { __le32 dqc_free; /* Number of free entries in the bitmap */ - u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + __u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding * chunk of quota file */ }; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 30967e3f5e4..74db2be75dd 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -276,7 +276,7 @@ static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } -void ocfs2_kref_remove_refcount_tree(struct kref *kref) +static void ocfs2_kref_remove_refcount_tree(struct kref *kref) { struct ocfs2_refcount_tree *tree = container_of(kref, struct ocfs2_refcount_tree, rf_getcnt); @@ -524,23 +524,6 @@ out: return ret; } -int ocfs2_lock_refcount_tree_by_inode(struct inode *inode, int rw, - struct ocfs2_refcount_tree **ret_tree, - struct buffer_head **ref_bh) -{ - int ret; - u64 ref_blkno; - - ret = ocfs2_get_refcount_block(inode, &ref_blkno); - if (ret) { - mlog_errno(ret); - return ret; - } - - return ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, - rw, ret_tree, ref_bh); -} - void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb, struct ocfs2_refcount_tree *tree, int rw) { @@ -969,6 +952,103 @@ out: } /* + * Find the end range for a leaf refcount block indicated by + * el->l_recs[index].e_blkno. + */ +static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci, + struct buffer_head *ref_root_bh, + struct ocfs2_extent_block *eb, + struct ocfs2_extent_list *el, + int index, u32 *cpos_end) +{ + int ret, i, subtree_root; + u32 cpos; + u64 blkno; + struct super_block *sb = ocfs2_metadata_cache_get_super(ci); + struct ocfs2_path *left_path = NULL, *right_path = NULL; + struct ocfs2_extent_tree et; + struct ocfs2_extent_list *tmp_el; + + if (index < le16_to_cpu(el->l_next_free_rec) - 1) { + /* + * We have a extent rec after index, so just use the e_cpos + * of the next extent rec. + */ + *cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos); + return 0; + } + + if (!eb || (eb && !eb->h_next_leaf_blk)) { + /* + * We are the last extent rec, so any high cpos should + * be stored in this leaf refcount block. + */ + *cpos_end = UINT_MAX; + return 0; + } + + /* + * If the extent block isn't the last one, we have to find + * the subtree root between this extent block and the next + * leaf extent block and get the corresponding e_cpos from + * the subroot. Otherwise we may corrupt the b-tree. + */ + ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh); + + left_path = ocfs2_new_path_from_et(&et); + if (!left_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos); + ret = ocfs2_find_path(ci, left_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + right_path = ocfs2_new_path_from_path(left_path); + if (!right_path) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_find_path(ci, right_path, cpos); + if (ret) { + mlog_errno(ret); + goto out; + } + + subtree_root = ocfs2_find_subtree_root(&et, left_path, + right_path); + + tmp_el = left_path->p_node[subtree_root].el; + blkno = left_path->p_node[subtree_root+1].bh->b_blocknr; + for (i = 0; i < le32_to_cpu(tmp_el->l_next_free_rec); i++) { + if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) { + *cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos); + break; + } + } + + BUG_ON(i == le32_to_cpu(tmp_el->l_next_free_rec)); + +out: + ocfs2_free_path(left_path); + ocfs2_free_path(right_path); + return ret; +} + +/* * Given a cpos and len, try to find the refcount record which contains cpos. * 1. If cpos can be found in one refcount record, return the record. * 2. If cpos can't be found, return a fake record which start from cpos @@ -983,10 +1063,10 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, struct buffer_head **ret_bh) { int ret = 0, i, found; - u32 low_cpos; + u32 low_cpos, uninitialized_var(cpos_end); struct ocfs2_extent_list *el; - struct ocfs2_extent_rec *tmp, *rec = NULL; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + struct ocfs2_extent_block *eb = NULL; struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(ci); struct ocfs2_refcount_block *rb = @@ -1034,12 +1114,16 @@ static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci, } } - /* adjust len when we have ocfs2_extent_rec after it. */ - if (found && i < le16_to_cpu(el->l_next_free_rec) - 1) { - tmp = &el->l_recs[i+1]; + if (found) { + ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh, + eb, el, i, &cpos_end); + if (ret) { + mlog_errno(ret); + goto out; + } - if (le32_to_cpu(tmp->e_cpos) < cpos + len) - len = le32_to_cpu(tmp->e_cpos) - cpos; + if (cpos_end < low_cpos + len) + len = cpos_end - low_cpos; } ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno), @@ -1418,7 +1502,7 @@ static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh, /* change old and new rl_used accordingly. */ le16_add_cpu(&rl->rl_used, -num_moved); - new_rl->rl_used = cpu_to_le32(num_moved); + new_rl->rl_used = cpu_to_le16(num_moved); sort(&rl->rl_recs, le16_to_cpu(rl->rl_used), sizeof(struct ocfs2_refcount_rec), @@ -1797,7 +1881,8 @@ static int ocfs2_split_refcount_rec(handle_t *handle, recs_need++; /* If the leaf block don't have enough record, expand it. */ - if (le16_to_cpu(rf_list->rl_used) + recs_need > rf_list->rl_count) { + if (le16_to_cpu(rf_list->rl_used) + recs_need > + le16_to_cpu(rf_list->rl_count)) { struct ocfs2_refcount_rec tmp_rec; u64 cpos = le64_to_cpu(orig_rec->r_cpos); len = le32_to_cpu(orig_rec->r_clusters); @@ -1859,7 +1944,7 @@ static int ocfs2_split_refcount_rec(handle_t *handle, memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec)); le64_add_cpu(&tail_rec->r_cpos, le32_to_cpu(tail_rec->r_clusters) - len); - tail_rec->r_clusters = le32_to_cpu(len); + tail_rec->r_clusters = cpu_to_le32(len); } /* @@ -3840,8 +3925,7 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, } ret = ocfs2_insert_extent(handle, et, cpos, - cpu_to_le64(ocfs2_clusters_to_blocks(inode->i_sb, - p_cluster)), + ocfs2_clusters_to_blocks(inode->i_sb, p_cluster), num_clusters, ext_flags, meta_ac); if (ret) { mlog_errno(ret); @@ -4253,8 +4337,8 @@ static int ocfs2_user_path_parent(const char __user *path, * @new_dentry: target dentry * @preserve: if true, preserve all file attributes */ -int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, - struct dentry *new_dentry, bool preserve) +static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool preserve) { struct inode *inode = old_dentry->d_inode; int error; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index ff4c798a563..da78a2a334f 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -814,7 +814,7 @@ static int fs_protocol_compare(struct ocfs2_protocol_version *existing, static int user_cluster_connect(struct ocfs2_cluster_connection *conn) { dlm_lockspace_t *fsdlm; - struct ocfs2_live_connection *control; + struct ocfs2_live_connection *uninitialized_var(control); int rc = 0; BUG_ON(conn == NULL); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 14f47d2bfe0..26069917a9f 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -100,6 +100,8 @@ struct mount_options static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, int is_remount); +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options); static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt); static void ocfs2_put_super(struct super_block *sb); static int ocfs2_mount_volume(struct super_block *sb); @@ -600,7 +602,8 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) lock_kernel(); - if (!ocfs2_parse_options(sb, data, &parsed_options, 1)) { + if (!ocfs2_parse_options(sb, data, &parsed_options, 1) || + !ocfs2_check_set_options(sb, &parsed_options)) { ret = -EINVAL; goto out; } @@ -691,8 +694,6 @@ unlock_osb: if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -701,6 +702,10 @@ unlock_osb: if (!ocfs2_is_hard_readonly(osb)) ocfs2_set_journal_params(osb); + + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? + MS_POSIXACL : 0); } out: unlock_kernel(); @@ -1011,31 +1016,16 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); bh = NULL; - if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) - parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; - + if (!ocfs2_check_set_options(sb, &parsed_options)) { + status = -EINVAL; + goto read_super_error; + } osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; - if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "User quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } - if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && - !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - status = -EINVAL; - mlog(ML_ERROR, "Group quotas were requested, but this " - "filesystem does not have the feature enabled.\n"); - goto read_super_error; - } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -1245,6 +1235,40 @@ static struct file_system_type ocfs2_fs_type = { .next = NULL }; +static int ocfs2_check_set_options(struct super_block *sb, + struct mount_options *options) +{ + if (options->mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + return 0; + } + if (options->mount_opt & OCFS2_MOUNT_POSIX_ACL && + !OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) { + mlog(ML_ERROR, "ACL support requested but extended attributes " + "feature is not enabled\n"); + return 0; + } + /* No ACL setting specified? Use XATTR feature... */ + if (!(options->mount_opt & (OCFS2_MOUNT_POSIX_ACL | + OCFS2_MOUNT_NO_POSIX_ACL))) { + if (OCFS2_HAS_INCOMPAT_FEATURE(sb, OCFS2_FEATURE_INCOMPAT_XATTR)) + options->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + else + options->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; + } + return 1; +} + static int ocfs2_parse_options(struct super_block *sb, char *options, struct mount_options *mopt, @@ -1392,40 +1416,19 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; case Opt_usrquota: - /* We check only on remount, otherwise features - * aren't yet initialized. */ - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { - mlog(ML_ERROR, "User quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; break; case Opt_grpquota: - if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { - mlog(ML_ERROR, "Group quota requested but " - "filesystem feature is not set\n"); - status = 0; - goto bail; - } mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; break; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL case Opt_acl: mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + mopt->mount_opt &= ~OCFS2_MOUNT_NO_POSIX_ACL; break; case Opt_noacl: + mopt->mount_opt |= OCFS2_MOUNT_NO_POSIX_ACL; mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; break; -#else - case Opt_acl: - case Opt_noacl: - printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); - break; -#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1502,12 +1505,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); -#ifdef CONFIG_OCFS2_FS_POSIX_ACL if (opts & OCFS2_MOUNT_POSIX_ACL) seq_printf(s, ",acl"); else seq_printf(s, ",noacl"); -#endif return 0; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index e3421030a69..49b133ccbf1 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -163,6 +163,7 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, @@ -174,4 +175,5 @@ const struct inode_operations ocfs2_fast_symlink_inode_operations = { .getxattr = generic_getxattr, .listxattr = ocfs2_listxattr, .removexattr = generic_removexattr, + .fiemap = ocfs2_fiemap, }; diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 43c114831c0..8fc6fb071c6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -98,10 +98,8 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL &ocfs2_xattr_acl_access_handler, &ocfs2_xattr_acl_default_handler, -#endif &ocfs2_xattr_trusted_handler, &ocfs2_xattr_security_handler, NULL @@ -109,12 +107,10 @@ struct xattr_handler *ocfs2_xattr_handlers[] = { static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, -#ifdef CONFIG_OCFS2_FS_POSIX_ACL [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] = &ocfs2_xattr_acl_access_handler, [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ocfs2_xattr_acl_default_handler, -#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; @@ -6064,7 +6060,7 @@ static int ocfs2_value_metas_in_xattr_header(struct super_block *sb, * to the extent block, so just calculate a maximum record num. */ if (!xv->xr_list.l_tree_depth) - *num_recs += xv->xr_list.l_next_free_rec; + *num_recs += le16_to_cpu(xv->xr_list.l_next_free_rec); else *num_recs += ocfs2_clusters_for_bytes(sb, XATTR_SIZE_MAX); diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 08e36389f56..abd72a47f52 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -40,10 +40,8 @@ struct ocfs2_security_xattr_info { extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; extern struct xattr_handler ocfs2_xattr_security_handler; -#ifdef CONFIG_OCFS2_FS_POSIX_ACL extern struct xattr_handler ocfs2_xattr_acl_access_handler; extern struct xattr_handler ocfs2_xattr_acl_default_handler; -#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); diff --git a/fs/open.c b/fs/open.c index ca69241796b..040cef72bc0 100644 --- a/fs/open.c +++ b/fs/open.c @@ -821,15 +821,14 @@ static inline int __get_file_write_access(struct inode *inode, } static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, - int flags, struct file *f, + struct file *f, int (*open)(struct inode *, struct file *), const struct cred *cred) { struct inode *inode; int error; - f->f_flags = flags; - f->f_mode = (__force fmode_t)((flags+1) & O_ACCMODE) | FMODE_LSEEK | + f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; inode = dentry->d_inode; if (f->f_mode & FMODE_WRITE) { @@ -930,7 +929,6 @@ struct file *lookup_instantiate_filp(struct nameidata *nd, struct dentry *dentry if (IS_ERR(dentry)) goto out_err; nd->intent.open.file = __dentry_open(dget(dentry), mntget(nd->path.mnt), - nd->intent.open.flags - 1, nd->intent.open.file, open, cred); out: @@ -949,7 +947,7 @@ EXPORT_SYMBOL_GPL(lookup_instantiate_filp); * * Note that this function destroys the original nameidata */ -struct file *nameidata_to_filp(struct nameidata *nd, int flags) +struct file *nameidata_to_filp(struct nameidata *nd) { const struct cred *cred = current_cred(); struct file *filp; @@ -958,7 +956,7 @@ struct file *nameidata_to_filp(struct nameidata *nd, int flags) filp = nd->intent.open.file; /* Has the filesystem initialised the file for us? */ if (filp->f_path.dentry == NULL) - filp = __dentry_open(nd->path.dentry, nd->path.mnt, flags, filp, + filp = __dentry_open(nd->path.dentry, nd->path.mnt, filp, NULL, cred); else path_put(&nd->path); @@ -997,7 +995,8 @@ struct file *dentry_open(struct dentry *dentry, struct vfsmount *mnt, int flags, return ERR_PTR(error); } - return __dentry_open(dentry, mnt, flags, f, NULL, cred); + f->f_flags = flags; + return __dentry_open(dentry, mnt, f, NULL, cred); } EXPORT_SYMBOL(dentry_open); diff --git a/fs/proc/array.c b/fs/proc/array.c index 4badde179b1..13b5d070817 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -134,13 +134,16 @@ static inline void task_name(struct seq_file *m, struct task_struct *p) * simple bit tests. */ static const char *task_state_array[] = { - "R (running)", /* 0 */ - "S (sleeping)", /* 1 */ - "D (disk sleep)", /* 2 */ - "T (stopped)", /* 4 */ - "T (tracing stop)", /* 8 */ - "Z (zombie)", /* 16 */ - "X (dead)" /* 32 */ + "R (running)", /* 0 */ + "S (sleeping)", /* 1 */ + "D (disk sleep)", /* 2 */ + "T (stopped)", /* 4 */ + "t (tracing stop)", /* 8 */ + "Z (zombie)", /* 16 */ + "X (dead)", /* 32 */ + "x (dead)", /* 64 */ + "K (wakekill)", /* 128 */ + "W (waking)", /* 256 */ }; static inline const char *get_task_state(struct task_struct *tsk) @@ -148,6 +151,8 @@ static inline const char *get_task_state(struct task_struct *tsk) unsigned int state = (tsk->state & TASK_REPORT) | tsk->exit_state; const char **p = &task_state_array[0]; + BUILD_BUG_ON(1 + ilog2(TASK_STATE_MAX) != ARRAY_SIZE(task_state_array)); + while (state) { p++; state >>= 1; @@ -322,94 +327,6 @@ static inline void task_context_switch_counts(struct seq_file *m, p->nivcsw); } -#ifdef CONFIG_MMU - -struct stack_stats { - struct vm_area_struct *vma; - unsigned long startpage; - unsigned long usage; -}; - -static int stack_usage_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, struct mm_walk *walk) -{ - struct stack_stats *ss = walk->private; - struct vm_area_struct *vma = ss->vma; - pte_t *pte, ptent; - spinlock_t *ptl; - int ret = 0; - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (; addr != end; pte++, addr += PAGE_SIZE) { - ptent = *pte; - -#ifdef CONFIG_STACK_GROWSUP - if (pte_present(ptent) || is_swap_pte(ptent)) - ss->usage = addr - ss->startpage + PAGE_SIZE; -#else - if (pte_present(ptent) || is_swap_pte(ptent)) { - ss->usage = ss->startpage - addr + PAGE_SIZE; - pte++; - ret = 1; - break; - } -#endif - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - return ret; -} - -static inline unsigned long get_stack_usage_in_bytes(struct vm_area_struct *vma, - struct task_struct *task) -{ - struct stack_stats ss; - struct mm_walk stack_walk = { - .pmd_entry = stack_usage_pte_range, - .mm = vma->vm_mm, - .private = &ss, - }; - - if (!vma->vm_mm || is_vm_hugetlb_page(vma)) - return 0; - - ss.vma = vma; - ss.startpage = task->stack_start & PAGE_MASK; - ss.usage = 0; - -#ifdef CONFIG_STACK_GROWSUP - walk_page_range(KSTK_ESP(task) & PAGE_MASK, vma->vm_end, - &stack_walk); -#else - walk_page_range(vma->vm_start, (KSTK_ESP(task) & PAGE_MASK) + PAGE_SIZE, - &stack_walk); -#endif - return ss.usage; -} - -static inline void task_show_stack_usage(struct seq_file *m, - struct task_struct *task) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = get_task_mm(task); - - if (mm) { - down_read(&mm->mmap_sem); - vma = find_vma(mm, task->stack_start); - if (vma) - seq_printf(m, "Stack usage:\t%lu kB\n", - get_stack_usage_in_bytes(vma, task) >> 10); - - up_read(&mm->mmap_sem); - mmput(mm); - } -} -#else -static void task_show_stack_usage(struct seq_file *m, struct task_struct *task) -{ -} -#endif /* CONFIG_MMU */ - static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t"); @@ -440,7 +357,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, task_show_regs(m, task); #endif task_context_switch_counts(m, task); - task_show_stack_usage(m, task); return 0; } diff --git a/fs/proc/base.c b/fs/proc/base.c index 18d5cc62d8e..e42bbd843ed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1419,7 +1419,6 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); - nd->last_type = LAST_BIND; out: return ERR_PTR(error); } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 47c03f4336b..f277c4a111c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -361,12 +361,11 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (!pte_present(ptent)) continue; - mss->resident += PAGE_SIZE; - page = vm_normal_page(vma, addr, ptent); if (!page) continue; + mss->resident += PAGE_SIZE; /* Accumulate the size in pages that have been accessed. */ if (pte_young(ptent) || PageReferenced(page)) mss->referenced += PAGE_SIZE; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index cd6bb9a33c1..3fc62b097be 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -323,6 +323,30 @@ int dquot_mark_dquot_dirty(struct dquot *dquot) } EXPORT_SYMBOL(dquot_mark_dquot_dirty); +/* Dirtify all the dquots - this can block when journalling */ +static inline int mark_all_dquot_dirty(struct dquot * const *dquot) +{ + int ret, err, cnt; + + ret = err = 0; + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (dquot[cnt]) + /* Even in case of error we have to continue */ + ret = mark_dquot_dirty(dquot[cnt]); + if (!err) + err = ret; + } + return err; +} + +static inline void dqput_all(struct dquot **dquot) +{ + unsigned int cnt; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + dqput(dquot[cnt]); +} + /* This function needs dq_list_lock */ static inline int clear_dquot_dirty(struct dquot *dquot) { @@ -1268,8 +1292,7 @@ int dquot_initialize(struct inode *inode, int type) out_err: up_write(&sb_dqopt(sb)->dqptr_sem); /* Drop unused references */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(got[cnt]); + dqput_all(got); return ret; } EXPORT_SYMBOL(dquot_initialize); @@ -1288,9 +1311,7 @@ int dquot_drop(struct inode *inode) inode->i_dquot[cnt] = NULL; } up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - dqput(put[cnt]); + dqput_all(put); return 0; } EXPORT_SYMBOL(dquot_drop); @@ -1319,6 +1340,70 @@ void vfs_dq_drop(struct inode *inode) EXPORT_SYMBOL(vfs_dq_drop); /* + * inode_reserved_space is managed internally by quota, and protected by + * i_lock similar to i_blocks+i_bytes. + */ +static qsize_t *inode_reserved_space(struct inode * inode) +{ + /* Filesystem must explicitly define it's own method in order to use + * quota reservation interface */ + BUG_ON(!inode->i_sb->dq_op->get_reserved_space); + return inode->i_sb->dq_op->get_reserved_space(inode); +} + +static void inode_add_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) += number; + spin_unlock(&inode->i_lock); +} + + +static void inode_claim_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + __inode_add_bytes(inode, number); + spin_unlock(&inode->i_lock); +} + +static void inode_sub_rsv_space(struct inode *inode, qsize_t number) +{ + spin_lock(&inode->i_lock); + *inode_reserved_space(inode) -= number; + spin_unlock(&inode->i_lock); +} + +static qsize_t inode_get_rsv_space(struct inode *inode) +{ + qsize_t ret; + + if (!inode->i_sb->dq_op->get_reserved_space) + return 0; + spin_lock(&inode->i_lock); + ret = *inode_reserved_space(inode); + spin_unlock(&inode->i_lock); + return ret; +} + +static void inode_incr_space(struct inode *inode, qsize_t number, + int reserve) +{ + if (reserve) + inode_add_rsv_space(inode, number); + else + inode_add_bytes(inode, number); +} + +static void inode_decr_space(struct inode *inode, qsize_t number, int reserve) +{ + if (reserve) + inode_sub_rsv_space(inode, number); + else + inode_sub_bytes(inode, number); +} + +/* * Following four functions update i_blocks+i_bytes fields and * quota information (together with appropriate checks) * NOTE: We absolutely rely on the fact that caller dirties @@ -1336,6 +1421,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int cnt, ret = QUOTA_OK; char warntype[MAXQUOTAS]; + /* + * First test before acquiring mutex - solves deadlocks when we + * re-enter the quota code and are already holding the mutex + */ + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out; + } + + down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); + if (IS_NOQUOTA(inode)) { + inode_incr_space(inode, number, reserve); + goto out_unlock; + } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1346,7 +1446,8 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt) == NO_QUOTA) { ret = NO_QUOTA; - goto out_unlock; + spin_unlock(&dq_data_lock); + goto out_flush_warn; } } for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1357,64 +1458,29 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, else dquot_incr_space(inode->i_dquot[cnt], number); } - if (!reserve) - inode_add_bytes(inode, number); -out_unlock: + inode_incr_space(inode, number, reserve); spin_unlock(&dq_data_lock); + + if (reserve) + goto out_flush_warn; + mark_all_dquot_dirty(inode->i_dquot); +out_flush_warn: flush_warnings(inode->i_dquot, warntype); +out_unlock: + up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); +out: return ret; } int dquot_alloc_space(struct inode *inode, qsize_t number, int warn) { - int cnt, ret = QUOTA_OK; - - /* - * First test before acquiring mutex - solves deadlocks when we - * re-enter the quota code and are already holding the mutex - */ - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out; - } - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); - goto out_unlock; - } - - ret = __dquot_alloc_space(inode, number, warn, 0); - if (ret == NO_QUOTA) - goto out_unlock; - - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 0); } EXPORT_SYMBOL(dquot_alloc_space); int dquot_reserve_space(struct inode *inode, qsize_t number, int warn) { - int ret = QUOTA_OK; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - ret = __dquot_alloc_space(inode, number, warn, 1); -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return ret; + return __dquot_alloc_space(inode, number, warn, 1); } EXPORT_SYMBOL(dquot_reserve_space); @@ -1455,10 +1521,7 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number) warn_put_all: spin_unlock(&dq_data_lock); if (ret == QUOTA_OK) - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return ret; @@ -1471,14 +1534,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number) int ret = QUOTA_OK; if (IS_NOQUOTA(inode)) { - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); if (IS_NOQUOTA(inode)) { up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); goto out; } @@ -1490,12 +1553,9 @@ int dquot_claim_space(struct inode *inode, qsize_t number) number); } /* Update inode bytes */ - inode_add_bytes(inode, number); + inode_claim_rsv_space(inode, number); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); out: return ret; @@ -1503,38 +1563,9 @@ out: EXPORT_SYMBOL(dquot_claim_space); /* - * Release reserved quota space - */ -void dquot_release_reserved_space(struct inode *inode, qsize_t number) -{ - int cnt; - - if (IS_NOQUOTA(inode)) - goto out; - - down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); - if (IS_NOQUOTA(inode)) - goto out_unlock; - - spin_lock(&dq_data_lock); - /* Release reserved dquots */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (inode->i_dquot[cnt]) - dquot_free_reserved_space(inode->i_dquot[cnt], number); - } - spin_unlock(&dq_data_lock); - -out_unlock: - up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); -out: - return; -} -EXPORT_SYMBOL(dquot_release_reserved_space); - -/* * This operation can block, but only after everything is updated */ -int dquot_free_space(struct inode *inode, qsize_t number) +int __dquot_free_space(struct inode *inode, qsize_t number, int reserve) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1543,7 +1574,7 @@ int dquot_free_space(struct inode *inode, qsize_t number) * re-enter the quota code and are already holding the mutex */ if (IS_NOQUOTA(inode)) { out_sub: - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); return QUOTA_OK; } @@ -1558,21 +1589,40 @@ out_sub: if (!inode->i_dquot[cnt]) continue; warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number); - dquot_decr_space(inode->i_dquot[cnt], number); + if (reserve) + dquot_free_reserved_space(inode->i_dquot[cnt], number); + else + dquot_decr_space(inode->i_dquot[cnt], number); } - inode_sub_bytes(inode, number); + inode_decr_space(inode, number, reserve); spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + + if (reserve) + goto out_unlock; + mark_all_dquot_dirty(inode->i_dquot); +out_unlock: flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; } + +int dquot_free_space(struct inode *inode, qsize_t number) +{ + return __dquot_free_space(inode, number, 0); +} EXPORT_SYMBOL(dquot_free_space); /* + * Release reserved quota space + */ +void dquot_release_reserved_space(struct inode *inode, qsize_t number) +{ + __dquot_free_space(inode, number, 1); + +} +EXPORT_SYMBOL(dquot_release_reserved_space); + +/* * This operation can block, but only after everything is updated */ int dquot_free_inode(const struct inode *inode, qsize_t number) @@ -1599,10 +1649,7 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) dquot_decr_inodes(inode->i_dquot[cnt], number); } spin_unlock(&dq_data_lock); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (inode->i_dquot[cnt]) - mark_dquot_dirty(inode->i_dquot[cnt]); + mark_all_dquot_dirty(inode->i_dquot); flush_warnings(inode->i_dquot, warntype); up_read(&sb_dqopt(inode->i_sb)->dqptr_sem); return QUOTA_OK; @@ -1610,19 +1657,6 @@ int dquot_free_inode(const struct inode *inode, qsize_t number) EXPORT_SYMBOL(dquot_free_inode); /* - * call back function, get reserved quota space from underlying fs - */ -qsize_t dquot_get_reserved_space(struct inode *inode) -{ - qsize_t reserved_space = 0; - - if (sb_any_quota_active(inode->i_sb) && - inode->i_sb->dq_op->get_reserved_space) - reserved_space = inode->i_sb->dq_op->get_reserved_space(inode); - return reserved_space; -} - -/* * Transfer the number of inode and blocks from one diskquota to an other. * * This operation can block, but only after everything is updated @@ -1665,7 +1699,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) } spin_lock(&dq_data_lock); cur_space = inode_get_bytes(inode); - rsv_space = dquot_get_reserved_space(inode); + rsv_space = inode_get_rsv_space(inode); space = cur_space + rsv_space; /* Build the transfer_from list and check the limits */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) { @@ -1709,25 +1743,18 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) spin_unlock(&dq_data_lock); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); - /* Dirtify all the dquots - this can block when journalling */ - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - if (transfer_from[cnt]) - mark_dquot_dirty(transfer_from[cnt]); - if (transfer_to[cnt]) { - mark_dquot_dirty(transfer_to[cnt]); - /* The reference we got is transferred to the inode */ - transfer_to[cnt] = NULL; - } - } + mark_all_dquot_dirty(transfer_from); + mark_all_dquot_dirty(transfer_to); + /* The reference we got is transferred to the inode */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + transfer_to[cnt] = NULL; warn_put_all: flush_warnings(transfer_to, warntype_to); flush_warnings(transfer_from, warntype_from_inodes); flush_warnings(transfer_from, warntype_from_space); put_all: - for (cnt = 0; cnt < MAXQUOTAS; cnt++) { - dqput(transfer_from[cnt]); - dqput(transfer_to[cnt]); - } + dqput_all(transfer_from); + dqput_all(transfer_to); return ret; over_quota: spin_unlock(&dq_data_lock); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 3dfc23e0213..e3da02f4986 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -97,8 +97,11 @@ static int v2_read_file_info(struct super_block *sb, int type) unsigned int version; if (!v2_read_header(sb, type, &dqhead)) - return 0; + return -1; version = le32_to_cpu(dqhead.dqh_version); + if ((info->dqi_fmt_id == QFMT_VFS_V0 && version != 0) || + (info->dqi_fmt_id == QFMT_VFS_V1 && version != 1)) + return -1; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); @@ -120,8 +123,8 @@ static int v2_read_file_info(struct super_block *sb, int type) info->dqi_maxilimit = 0xffffffff; } else { /* used space is stored as unsigned 64-bit value */ - info->dqi_maxblimit = 0xffffffffffffffff; /* 2^64-1 */ - info->dqi_maxilimit = 0xffffffffffffffff; + info->dqi_maxblimit = 0xffffffffffffffffULL; /* 2^64-1 */ + info->dqi_maxilimit = 0xffffffffffffffffULL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 32fae4040eb..1739a4aba25 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -60,7 +60,7 @@ const struct inode_operations ramfs_file_inode_operations = { */ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { - unsigned long npages, xpages, loop, limit; + unsigned long npages, xpages, loop; struct page *pages; unsigned order; void *data; @@ -123,30 +123,6 @@ add_error: /*****************************************************************************/ /* - * check that file shrinkage doesn't leave any VMAs dangling in midair - */ -static int ramfs_nommu_check_mappings(struct inode *inode, - size_t newsize, size_t size) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - - /* search for VMAs that fall within the dead zone */ - vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, - newsize >> PAGE_SHIFT, - (size + PAGE_SIZE - 1) >> PAGE_SHIFT - ) { - /* found one - only interested if it's shared out of the page - * cache */ - if (vma->vm_flags & VM_SHARED) - return -ETXTBSY; /* not quite true, but near enough */ - } - - return 0; -} - -/*****************************************************************************/ -/* * */ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) @@ -164,7 +140,7 @@ static int ramfs_nommu_resize(struct inode *inode, loff_t newsize, loff_t size) /* check that a decrease in size doesn't cut off any shared mappings */ if (newsize < size) { - ret = ramfs_nommu_check_mappings(inode, newsize, size); + ret = nommu_shrink_inode_mappings(inode, size, newsize); if (ret < 0) return ret; } diff --git a/fs/reiserfs/Kconfig b/fs/reiserfs/Kconfig index ac7cd75c86f..513f431038f 100644 --- a/fs/reiserfs/Kconfig +++ b/fs/reiserfs/Kconfig @@ -1,7 +1,6 @@ config REISERFS_FS tristate "Reiserfs support" select CRC32 - select FS_JOURNAL_INFO help Stores not just filenames but the files themselves in a balanced tree. Uses journalling. diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index 68549570718..65c87276117 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -1277,7 +1277,10 @@ int reiserfs_init_bitmap_cache(struct super_block *sb) struct reiserfs_bitmap_info *bitmap; unsigned int bmap_nr = reiserfs_bmap_count(sb); + /* Avoid lock recursion in fault case */ + reiserfs_write_unlock(sb); bitmap = vmalloc(sizeof(*bitmap) * bmap_nr); + reiserfs_write_lock(sb); if (bitmap == NULL) return -ENOMEM; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3a28e7751b3..9087b10209e 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -31,11 +31,12 @@ void reiserfs_delete_inode(struct inode *inode) JOURNAL_PER_BALANCE_CNT * 2 + 2 * REISERFS_QUOTA_INIT_BLOCKS(inode->i_sb); struct reiserfs_transaction_handle th; + int depth; int err; truncate_inode_pages(&inode->i_data, 0); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); /* The = 0 happens when we abort creating a new inode for some reason like lack of space.. */ if (!(inode->i_state & I_NEW) && INODE_PKEY(inode)->k_objectid != 0) { /* also handles bad_inode case */ @@ -74,7 +75,7 @@ void reiserfs_delete_inode(struct inode *inode) out: clear_inode(inode); /* note this must go after the journal_end to prevent deadlock */ inode->i_blocks = 0; - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); } static void _make_cpu_key(struct cpu_key *key, int version, __u32 dirid, @@ -2538,6 +2539,12 @@ static int reiserfs_writepage(struct page *page, struct writeback_control *wbc) return reiserfs_write_full_page(page, wbc); } +static void reiserfs_truncate_failed_write(struct inode *inode) +{ + truncate_inode_pages(inode->i_mapping, inode->i_size); + reiserfs_truncate_file(inode, 0); +} + static int reiserfs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -2604,6 +2611,8 @@ static int reiserfs_write_begin(struct file *file, if (ret) { unlock_page(page); page_cache_release(page); + /* Truncate allocated blocks */ + reiserfs_truncate_failed_write(inode); } return ret; } @@ -2701,9 +2710,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, ** transaction tracking stuff when the size changes. So, we have ** to do the i_size updates here. */ - pos += copied; - - if (pos > inode->i_size) { + if (pos + copied > inode->i_size) { struct reiserfs_transaction_handle myth; lock_depth = reiserfs_write_lock_once(inode->i_sb); locked = true; @@ -2721,7 +2728,7 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, goto journal_error; reiserfs_update_inode_transaction(inode); - inode->i_size = pos; + inode->i_size = pos + copied; /* * this will just nest into our transaction. It's important * to use mark_inode_dirty so the inode gets pushed around on the @@ -2751,6 +2758,10 @@ static int reiserfs_write_end(struct file *file, struct address_space *mapping, reiserfs_write_unlock_once(inode->i_sb, lock_depth); unlock_page(page); page_cache_release(page); + + if (pos + len > inode->i_size) + reiserfs_truncate_failed_write(inode); + return ret == 0 ? copied : ret; journal_error: @@ -3051,13 +3062,14 @@ static ssize_t reiserfs_direct_IO(int rw, struct kiocb *iocb, int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - int error; unsigned int ia_valid; + int depth; + int error; /* must be turned off for recursive notify_change calls */ ia_valid = attr->ia_valid &= ~(ATTR_KILL_SUID|ATTR_KILL_SGID); - reiserfs_write_lock(inode->i_sb); + depth = reiserfs_write_lock_once(inode->i_sb); if (attr->ia_valid & ATTR_SIZE) { /* version 2 items will be caught by the s_maxbytes check ** done for us in vmtruncate @@ -3138,8 +3150,17 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) journal_end(&th, inode->i_sb, jbegin_count); } } - if (!error) + if (!error) { + /* + * Relax the lock here, as it might truncate the + * inode pages and wait for inode pages locks. + * To release such page lock, the owner needs the + * reiserfs lock + */ + reiserfs_write_unlock_once(inode->i_sb, depth); error = inode_setattr(inode, attr); + depth = reiserfs_write_lock_once(inode->i_sb); + } } if (!error && reiserfs_posixacl(inode->i_sb)) { @@ -3148,7 +3169,8 @@ int reiserfs_setattr(struct dentry *dentry, struct iattr *attr) } out: - reiserfs_write_unlock(inode->i_sb); + reiserfs_write_unlock_once(inode->i_sb, depth); + return error; } diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index ace77451ceb..f53505de071 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -104,9 +104,10 @@ setflags_out: err = put_user(inode->i_generation, (int __user *)arg); break; case REISERFS_IOC_SETVERSION: - if (!is_owner_or_cap(inode)) + if (!is_owner_or_cap(inode)) { err = -EPERM; break; + } err = mnt_want_write(filp->f_path.mnt); if (err) break; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 2f8a7e7b8da..ba98546fabb 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -2009,10 +2009,11 @@ static int do_journal_release(struct reiserfs_transaction_handle *th, destroy_workqueue(commit_wq); commit_wq = NULL; } - reiserfs_write_lock(sb); free_journal_ram(sb); + reiserfs_write_lock(sb); + return 0; } @@ -2758,11 +2759,18 @@ int journal_init(struct super_block *sb, const char *j_dev_name, struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; char b[BDEVNAME_SIZE]; + int ret; + /* + * Unlock here to avoid various RECLAIM-FS-ON <-> IN-RECLAIM-FS + * dependency inversion warnings. + */ + reiserfs_write_unlock(sb); journal = SB_JOURNAL(sb) = vmalloc(sizeof(struct reiserfs_journal)); if (!journal) { reiserfs_warning(sb, "journal-1256", "unable to get memory for journal structure"); + reiserfs_write_lock(sb); return 1; } memset(journal, 0, sizeof(struct reiserfs_journal)); @@ -2771,10 +2779,12 @@ int journal_init(struct super_block *sb, const char *j_dev_name, INIT_LIST_HEAD(&journal->j_working_list); INIT_LIST_HEAD(&journal->j_journal_list); journal->j_persistent_trans = 0; - if (reiserfs_allocate_list_bitmaps(sb, - journal->j_list_bitmap, - reiserfs_bmap_count(sb))) + ret = reiserfs_allocate_list_bitmaps(sb, journal->j_list_bitmap, + reiserfs_bmap_count(sb)); + reiserfs_write_lock(sb); + if (ret) goto free_and_return; + allocate_bitmap_nodes(sb); /* reserved for journal area support */ @@ -2903,7 +2913,9 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_mount_id = 10; journal->j_state = 0; atomic_set(&(journal->j_jlock), 0); + reiserfs_write_unlock(sb); journal->j_cnode_free_list = allocate_cnodes(num_cnodes); + reiserfs_write_lock(sb); journal->j_cnode_free_orig = journal->j_cnode_free_list; journal->j_cnode_free = journal->j_cnode_free_list ? num_cnodes : 0; journal->j_cnode_used = 0; diff --git a/fs/reiserfs/lock.c b/fs/reiserfs/lock.c index ee2cfc0fd8a..b87aa2c1afc 100644 --- a/fs/reiserfs/lock.c +++ b/fs/reiserfs/lock.c @@ -86,3 +86,12 @@ void reiserfs_check_lock_depth(struct super_block *sb, char *caller) reiserfs_panic(sb, "%s called without kernel lock held %d", caller); } + +#ifdef CONFIG_REISERFS_CHECK +void reiserfs_lock_check_recursive(struct super_block *sb) +{ + struct reiserfs_sb_info *sb_i = REISERFS_SB(sb); + + WARN_ONCE((sb_i->lock_depth > 0), "Unwanted recursive reiserfs lock!\n"); +} +#endif diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e296ff72a6c..9d4dcf0b07c 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -921,6 +921,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) struct reiserfs_transaction_handle th; int jbegin_count; unsigned long savelink; + int depth; inode = dentry->d_inode; @@ -932,7 +933,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) JOURNAL_PER_BALANCE_CNT * 2 + 2 + 4 * REISERFS_QUOTA_TRANS_BLOCKS(dir->i_sb); - reiserfs_write_lock(dir->i_sb); + depth = reiserfs_write_lock_once(dir->i_sb); retval = journal_begin(&th, dir->i_sb, jbegin_count); if (retval) goto out_unlink; @@ -993,7 +994,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) retval = journal_end(&th, dir->i_sb, jbegin_count); reiserfs_check_path(&path); - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; end_unlink: @@ -1003,7 +1004,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (err) retval = err; out_unlink: - reiserfs_write_unlock(dir->i_sb); + reiserfs_write_unlock_once(dir->i_sb, depth); return retval; } diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 8c7033a8b67..81f09fab8ae 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -83,7 +83,8 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); error = dir->i_op->unlink(dir, dentry); mutex_unlock(&dentry->d_inode->i_mutex); @@ -98,7 +99,8 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) BUG_ON(!mutex_is_locked(&dir->i_mutex)); vfs_dq_init(dir); - mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_CHILD); + reiserfs_mutex_lock_nested_safe(&dentry->d_inode->i_mutex, + I_MUTEX_CHILD, dir->i_sb); dentry_unhash(dentry); error = dir->i_op->rmdir(dir, dentry); if (!error) @@ -235,16 +237,22 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (IS_PRIVATE(inode) || get_inode_sd_version(inode) == STAT_DATA_V1) return 0; + reiserfs_write_unlock(inode->i_sb); dir = open_xa_dir(inode, XATTR_REPLACE); if (IS_ERR(dir)) { err = PTR_ERR(dir); + reiserfs_write_lock(inode->i_sb); goto out; } else if (!dir->d_inode) { err = 0; + reiserfs_write_lock(inode->i_sb); goto out_dir; } mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_XATTR); + + reiserfs_write_lock(inode->i_sb); + buf.xadir = dir; err = reiserfs_readdir_dentry(dir, &buf, fill_with_dentries, &pos); while ((err == 0 || err == -ENOSPC) && buf.count) { @@ -283,8 +291,9 @@ static int reiserfs_for_each_xattr(struct inode *inode, err = journal_begin(&th, inode->i_sb, blocks); if (!err) { int jerror; - mutex_lock_nested(&dir->d_parent->d_inode->i_mutex, - I_MUTEX_XATTR); + reiserfs_mutex_lock_nested_safe( + &dir->d_parent->d_inode->i_mutex, + I_MUTEX_XATTR, inode->i_sb); err = action(dir, data); jerror = journal_end(&th, inode->i_sb, blocks); mutex_unlock(&dir->d_parent->d_inode->i_mutex); @@ -443,7 +452,9 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) } if (dentry->d_inode) { + reiserfs_write_lock(inode->i_sb); err = xattr_unlink(xadir->d_inode, dentry); + reiserfs_write_unlock(inode->i_sb); update_ctime(inode); } @@ -477,15 +488,24 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, if (get_inode_sd_version(inode) == STAT_DATA_V1) return -EOPNOTSUPP; - if (!buffer) - return lookup_and_delete_xattr(inode, name); + reiserfs_write_unlock(inode->i_sb); + + if (!buffer) { + err = lookup_and_delete_xattr(inode, name); + reiserfs_write_lock(inode->i_sb); + return err; + } dentry = xattr_lookup(inode, name, flags); - if (IS_ERR(dentry)) + if (IS_ERR(dentry)) { + reiserfs_write_lock(inode->i_sb); return PTR_ERR(dentry); + } down_write(&REISERFS_I(inode)->i_xattr_sem); + reiserfs_write_lock(inode->i_sb); + xahash = xattr_hash(buffer, buffer_size); while (buffer_pos < buffer_size || buffer_pos == 0) { size_t chunk; @@ -540,8 +560,12 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_size = buffer_size, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; + + reiserfs_write_unlock(inode->i_sb); mutex_lock_nested(&dentry->d_inode->i_mutex, I_MUTEX_XATTR); down_write(&dentry->d_inode->i_alloc_sem); + reiserfs_write_lock(inode->i_sb); + err = reiserfs_setattr(dentry, &newattrs); up_write(&dentry->d_inode->i_alloc_sem); mutex_unlock(&dentry->d_inode->i_mutex); diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index cc32e6ada67..dd20a7883f0 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -455,7 +455,9 @@ int reiserfs_acl_chmod(struct inode *inode) return 0; } + reiserfs_write_unlock(inode->i_sb); acl = reiserfs_get_acl(inode, ACL_TYPE_ACCESS); + reiserfs_write_lock(inode->i_sb); if (!acl) return 0; if (IS_ERR(acl)) diff --git a/fs/romfs/super.c b/fs/romfs/super.c index c117fa80d1e..42d21354689 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -544,6 +544,7 @@ error: error_rsb_inval: ret = -EINVAL; error_rsb: + kfree(rsb); return ret; } diff --git a/fs/signalfd.c b/fs/signalfd.c index b07565c9438..1dabe4ee02f 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -236,7 +236,7 @@ SYSCALL_DEFINE4(signalfd4, int, ufd, sigset_t __user *, user_mask, * anon_inode_getfd() will install the fd. */ ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + O_RDWR | (flags & (O_CLOEXEC | O_NONBLOCK))); if (ufd < 0) kfree(ctx); } else { diff --git a/fs/stat.c b/fs/stat.c index 075694e31d8..c4ecd52c573 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -401,9 +401,9 @@ SYSCALL_DEFINE4(fstatat64, int, dfd, char __user *, filename, } #endif /* __ARCH_WANT_STAT64 */ -void inode_add_bytes(struct inode *inode, loff_t bytes) +/* Caller is here responsible for sufficient locking (ie. inode->i_lock) */ +void __inode_add_bytes(struct inode *inode, loff_t bytes) { - spin_lock(&inode->i_lock); inode->i_blocks += bytes >> 9; bytes &= 511; inode->i_bytes += bytes; @@ -411,6 +411,12 @@ void inode_add_bytes(struct inode *inode, loff_t bytes) inode->i_blocks++; inode->i_bytes -= 512; } +} + +void inode_add_bytes(struct inode *inode, loff_t bytes) +{ + spin_lock(&inode->i_lock); + __inode_add_bytes(inode, bytes); spin_unlock(&inode->i_lock); } diff --git a/fs/super.c b/fs/super.c index 19eb70b374b..aff046b0fe7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -901,8 +901,9 @@ int get_sb_single(struct file_system_type *fs_type, return error; } s->s_flags |= MS_ACTIVE; + } else { + do_remount_sb(s, flags, data, 0); } - do_remount_sb(s, flags, data, 0); simple_set_mnt(mnt, s); return 0; } diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 60c702bc10a..a0a500af24a 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -483,7 +483,8 @@ void unmap_bin_file(struct sysfs_dirent *attr_sd) * @attr: attribute descriptor. */ -int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) +int sysfs_create_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { BUG_ON(!kobj || !kobj->sd || !attr); @@ -497,7 +498,8 @@ int sysfs_create_bin_file(struct kobject * kobj, struct bin_attribute * attr) * @attr: attribute descriptor. */ -void sysfs_remove_bin_file(struct kobject * kobj, struct bin_attribute * attr) +void sysfs_remove_bin_file(struct kobject *kobj, + const struct bin_attribute *attr) { sysfs_hash_and_remove(kobj->sd, attr->attr.name); } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f05f2303a8b..699f371b9f1 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -106,8 +106,10 @@ static struct sysfs_dirent *sysfs_get_active(struct sysfs_dirent *sd) return NULL; t = atomic_cmpxchg(&sd->s_active, v, v + 1); - if (likely(t == v)) + if (likely(t == v)) { + rwsem_acquire_read(&sd->dep_map, 0, 1, _RET_IP_); return sd; + } if (t < 0) return NULL; @@ -130,6 +132,7 @@ static void sysfs_put_active(struct sysfs_dirent *sd) if (unlikely(!sd)) return; + rwsem_release(&sd->dep_map, 1, _RET_IP_); v = atomic_dec_return(&sd->s_active); if (likely(v != SD_DEACTIVATED_BIAS)) return; @@ -194,15 +197,21 @@ static void sysfs_deactivate(struct sysfs_dirent *sd) BUG_ON(sd->s_sibling || !(sd->s_flags & SYSFS_FLAG_REMOVED)); sd->s_sibling = (void *)&wait; + rwsem_acquire(&sd->dep_map, 0, 0, _RET_IP_); /* atomic_add_return() is a mb(), put_active() will always see * the updated sd->s_sibling. */ v = atomic_add_return(SD_DEACTIVATED_BIAS, &sd->s_active); - if (v != SD_DEACTIVATED_BIAS) + if (v != SD_DEACTIVATED_BIAS) { + lock_contended(&sd->dep_map, _RET_IP_); wait_for_completion(&wait); + } sd->s_sibling = NULL; + + lock_acquired(&sd->dep_map, _RET_IP_); + rwsem_release(&sd->dep_map, 1, _RET_IP_); } static int sysfs_alloc_ino(ino_t *pino) @@ -345,6 +354,7 @@ struct sysfs_dirent *sysfs_new_dirent(const char *name, umode_t mode, int type) atomic_set(&sd->s_count, 1); atomic_set(&sd->s_active, 0); + sysfs_dirent_init_lockdep(sd); sd->s_name = name; sd->s_mode = mode; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ca52e7b9d8f..cdd9377a6e0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -8,6 +8,7 @@ * This file is released under the GPLv2. */ +#include <linux/lockdep.h> #include <linux/fs.h> struct sysfs_open_dirent; @@ -50,6 +51,9 @@ struct sysfs_inode_attrs { struct sysfs_dirent { atomic_t s_count; atomic_t s_active; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif struct sysfs_dirent *s_parent; struct sysfs_dirent *s_sibling; const char *s_name; @@ -84,6 +88,17 @@ static inline unsigned int sysfs_type(struct sysfs_dirent *sd) return sd->s_flags & SYSFS_TYPE_MASK; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define sysfs_dirent_init_lockdep(sd) \ +do { \ + static struct lock_class_key __key; \ + \ + lockdep_init_map(&sd->dep_map, "s_active", &__key, 0); \ +} while(0) +#else +#define sysfs_dirent_init_lockdep(sd) do {} while(0) +#endif + /* * Context structure to be used while adding/removing nodes. */ diff --git a/fs/timerfd.c b/fs/timerfd.c index b042bd7034b..1bfc95ad5f7 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -200,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & TFD_SHARED_FCNTL_FLAGS); + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); if (ufd < 0) kfree(ctx); diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 618c2701d3a..e5a3d8e96bb 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -54,6 +54,7 @@ */ #include <linux/pagemap.h> +#include <linux/list_sort.h> #include "ubifs.h" /* @@ -108,101 +109,6 @@ static int switch_gc_head(struct ubifs_info *c) } /** - * list_sort - sort a list. - * @priv: private data, passed to @cmp - * @head: the list to sort - * @cmp: the elements comparison function - * - * This function has been implemented by Mark J Roberts <mjr@znex.org>. It - * implements "merge sort" which has O(nlog(n)) complexity. The list is sorted - * in ascending order. - * - * The comparison function @cmp is supposed to return a negative value if @a is - * than @b, and a positive value if @a is greater than @b. If @a and @b are - * equivalent, then it does not matter what this function returns. - */ -static void list_sort(void *priv, struct list_head *head, - int (*cmp)(void *priv, struct list_head *a, - struct list_head *b)) -{ - struct list_head *p, *q, *e, *list, *tail, *oldhead; - int insize, nmerges, psize, qsize, i; - - if (list_empty(head)) - return; - - list = head->next; - list_del(head); - insize = 1; - for (;;) { - p = oldhead = list; - list = tail = NULL; - nmerges = 0; - - while (p) { - nmerges++; - q = p; - psize = 0; - for (i = 0; i < insize; i++) { - psize++; - q = q->next == oldhead ? NULL : q->next; - if (!q) - break; - } - - qsize = insize; - while (psize > 0 || (qsize > 0 && q)) { - if (!psize) { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } else if (!qsize || !q) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else if (cmp(priv, p, q) <= 0) { - e = p; - p = p->next; - psize--; - if (p == oldhead) - p = NULL; - } else { - e = q; - q = q->next; - qsize--; - if (q == oldhead) - q = NULL; - } - if (tail) - tail->next = e; - else - list = e; - e->prev = tail; - tail = e; - } - p = q; - } - - tail->next = list; - list->prev = tail; - - if (nmerges <= 1) - break; - - insize *= 2; - } - - head->next = list; - head->prev = list->prev; - list->prev->next = head; - list->prev = head; -} - -/** * data_nodes_cmp - compare 2 data nodes. * @priv: UBIFS file-system description object * @a: first data node diff --git a/fs/xfs/linux-2.6/xfs_acl.c b/fs/xfs/linux-2.6/xfs_acl.c index 2512125dfa7..883ca5ab8af 100644 --- a/fs/xfs/linux-2.6/xfs_acl.c +++ b/fs/xfs/linux-2.6/xfs_acl.c @@ -251,8 +251,9 @@ xfs_set_mode(struct inode *inode, mode_t mode) if (mode != inode->i_mode) { struct iattr iattr; - iattr.ia_valid = ATTR_MODE; + iattr.ia_valid = ATTR_MODE | ATTR_CTIME; iattr.ia_mode = mode; + iattr.ia_ctime = current_fs_time(inode->i_sb); error = -xfs_setattr(XFS_I(inode), &iattr, XFS_ATTR_NOACL); } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 09783cc444a..77414db10dc 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -954,16 +954,14 @@ xfs_fs_destroy_inode( ASSERT_ALWAYS(!xfs_iflags_test(ip, XFS_IRECLAIM)); /* - * If we have nothing to flush with this inode then complete the - * teardown now, otherwise delay the flush operation. + * We always use background reclaim here because even if the + * inode is clean, it still may be under IO and hence we have + * to take the flush lock. The background reclaim path handles + * this more efficiently than we can here, so simply let background + * reclaim tear down all inodes. */ - if (!xfs_inode_clean(ip)) { - xfs_inode_set_reclaim_tag(ip); - return; - } - out_reclaim: - xfs_ireclaim(ip); + xfs_inode_set_reclaim_tag(ip); } /* diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index 6fed97a8cd3..1f5e4bb5e97 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -65,7 +65,6 @@ xfs_inode_ag_lookup( * as the tree is sparse and a gang lookup walks to find * the number of objects requested. */ - read_lock(&pag->pag_ici_lock); if (tag == XFS_ICI_NO_TAG) { nr_found = radix_tree_gang_lookup(&pag->pag_ici_root, (void **)&ip, *first_index, 1); @@ -74,7 +73,7 @@ xfs_inode_ag_lookup( (void **)&ip, *first_index, 1, tag); } if (!nr_found) - goto unlock; + return NULL; /* * Update the index for the next lookup. Catch overflows @@ -84,13 +83,8 @@ xfs_inode_ag_lookup( */ *first_index = XFS_INO_TO_AGINO(mp, ip->i_ino + 1); if (*first_index < XFS_INO_TO_AGINO(mp, ip->i_ino)) - goto unlock; - + return NULL; return ip; - -unlock: - read_unlock(&pag->pag_ici_lock); - return NULL; } STATIC int @@ -100,7 +94,8 @@ xfs_inode_ag_walk( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { struct xfs_perag *pag = &mp->m_perag[ag]; uint32_t first_index; @@ -114,10 +109,20 @@ restart: int error = 0; xfs_inode_t *ip; + if (exclusive) + write_lock(&pag->pag_ici_lock); + else + read_lock(&pag->pag_ici_lock); ip = xfs_inode_ag_lookup(mp, pag, &first_index, tag); - if (!ip) + if (!ip) { + if (exclusive) + write_unlock(&pag->pag_ici_lock); + else + read_unlock(&pag->pag_ici_lock); break; + } + /* execute releases pag->pag_ici_lock */ error = execute(ip, pag, flags); if (error == EAGAIN) { skipped++; @@ -125,9 +130,8 @@ restart: } if (error) last_error = error; - /* - * bail out if the filesystem is corrupted. - */ + + /* bail out if the filesystem is corrupted. */ if (error == EFSCORRUPTED) break; @@ -148,7 +152,8 @@ xfs_inode_ag_iterator( int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), int flags, - int tag) + int tag, + int exclusive) { int error = 0; int last_error = 0; @@ -157,7 +162,8 @@ xfs_inode_ag_iterator( for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) { if (!mp->m_perag[ag].pag_ici_init) continue; - error = xfs_inode_ag_walk(mp, ag, execute, flags, tag); + error = xfs_inode_ag_walk(mp, ag, execute, flags, tag, + exclusive); if (error) { last_error = error; if (error == EFSCORRUPTED) @@ -174,30 +180,31 @@ xfs_sync_inode_valid( struct xfs_perag *pag) { struct inode *inode = VFS_I(ip); + int error = EFSCORRUPTED; /* nothing to sync during shutdown */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { - read_unlock(&pag->pag_ici_lock); - return EFSCORRUPTED; - } + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + goto out_unlock; - /* - * If we can't get a reference on the inode, it must be in reclaim. - * Leave it for the reclaim code to flush. Also avoid inodes that - * haven't been fully initialised. - */ - if (!igrab(inode)) { - read_unlock(&pag->pag_ici_lock); - return ENOENT; - } - read_unlock(&pag->pag_ici_lock); + /* avoid new or reclaimable inodes. Leave for reclaim code to flush */ + error = ENOENT; + if (xfs_iflags_test(ip, XFS_INEW | XFS_IRECLAIMABLE | XFS_IRECLAIM)) + goto out_unlock; - if (is_bad_inode(inode) || xfs_iflags_test(ip, XFS_INEW)) { + /* If we can't grab the inode, it must on it's way to reclaim. */ + if (!igrab(inode)) + goto out_unlock; + + if (is_bad_inode(inode)) { IRELE(ip); - return ENOENT; + goto out_unlock; } - return 0; + /* inode is valid */ + error = 0; +out_unlock: + read_unlock(&pag->pag_ici_lock); + return error; } STATIC int @@ -282,7 +289,7 @@ xfs_sync_data( ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0); error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); if (error) return XFS_ERROR(error); @@ -304,7 +311,7 @@ xfs_sync_attr( ASSERT((flags & ~SYNC_WAIT) == 0); return xfs_inode_ag_iterator(mp, xfs_sync_inode_attr, flags, - XFS_ICI_NO_TAG); + XFS_ICI_NO_TAG, 0); } STATIC int @@ -664,60 +671,6 @@ xfs_syncd_stop( kthread_stop(mp->m_sync_task); } -STATIC int -xfs_reclaim_inode( - xfs_inode_t *ip, - int sync_mode) -{ - xfs_perag_t *pag = xfs_get_perag(ip->i_mount, ip->i_ino); - - /* The hash lock here protects a thread in xfs_iget_core from - * racing with us on linking the inode back with a vnode. - * Once we have the XFS_IRECLAIM flag set it will not touch - * us. - */ - write_lock(&pag->pag_ici_lock); - spin_lock(&ip->i_flags_lock); - if (__xfs_iflags_test(ip, XFS_IRECLAIM) || - !__xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - return -EAGAIN; - } - __xfs_iflags_set(ip, XFS_IRECLAIM); - spin_unlock(&ip->i_flags_lock); - write_unlock(&pag->pag_ici_lock); - xfs_put_perag(ip->i_mount, pag); - - /* - * If the inode is still dirty, then flush it out. If the inode - * is not in the AIL, then it will be OK to flush it delwri as - * long as xfs_iflush() does not keep any references to the inode. - * We leave that decision up to xfs_iflush() since it has the - * knowledge of whether it's OK to simply do a delwri flush of - * the inode or whether we need to wait until the inode is - * pulled from the AIL. - * We get the flush lock regardless, though, just to make sure - * we don't free it while it is being flushed. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_iflock(ip); - - /* - * In the case of a forced shutdown we rely on xfs_iflush() to - * wait for the inode to be unpinned before returning an error. - */ - if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { - /* synchronize with xfs_iflush_done */ - xfs_iflock(ip); - xfs_ifunlock(ip); - } - - xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_ireclaim(ip); - return 0; -} - void __xfs_inode_set_reclaim_tag( struct xfs_perag *pag, @@ -760,19 +713,55 @@ __xfs_inode_clear_reclaim_tag( } STATIC int -xfs_reclaim_inode_now( +xfs_reclaim_inode( struct xfs_inode *ip, struct xfs_perag *pag, - int flags) + int sync_mode) { - /* ignore if already under reclaim */ - if (xfs_iflags_test(ip, XFS_IRECLAIM)) { - read_unlock(&pag->pag_ici_lock); + /* + * The radix tree lock here protects a thread in xfs_iget from racing + * with us starting reclaim on the inode. Once we have the + * XFS_IRECLAIM flag set it will not touch us. + */ + spin_lock(&ip->i_flags_lock); + ASSERT_ALWAYS(__xfs_iflags_test(ip, XFS_IRECLAIMABLE)); + if (__xfs_iflags_test(ip, XFS_IRECLAIM)) { + /* ignore as it is already under reclaim */ + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); return 0; } - read_unlock(&pag->pag_ici_lock); + __xfs_iflags_set(ip, XFS_IRECLAIM); + spin_unlock(&ip->i_flags_lock); + write_unlock(&pag->pag_ici_lock); - return xfs_reclaim_inode(ip, flags); + /* + * If the inode is still dirty, then flush it out. If the inode + * is not in the AIL, then it will be OK to flush it delwri as + * long as xfs_iflush() does not keep any references to the inode. + * We leave that decision up to xfs_iflush() since it has the + * knowledge of whether it's OK to simply do a delwri flush of + * the inode or whether we need to wait until the inode is + * pulled from the AIL. + * We get the flush lock regardless, though, just to make sure + * we don't free it while it is being flushed. + */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_iflock(ip); + + /* + * In the case of a forced shutdown we rely on xfs_iflush() to + * wait for the inode to be unpinned before returning an error. + */ + if (!is_bad_inode(VFS_I(ip)) && xfs_iflush(ip, sync_mode) == 0) { + /* synchronize with xfs_iflush_done */ + xfs_iflock(ip); + xfs_ifunlock(ip); + } + + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_ireclaim(ip); + return 0; } int @@ -780,6 +769,6 @@ xfs_reclaim_inodes( xfs_mount_t *mp, int mode) { - return xfs_inode_ag_iterator(mp, xfs_reclaim_inode_now, mode, - XFS_ICI_RECLAIM_TAG); + return xfs_inode_ag_iterator(mp, xfs_reclaim_inode, mode, + XFS_ICI_RECLAIM_TAG, 1); } diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index a500b4d9183..ea932b43335 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -54,6 +54,6 @@ void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, int xfs_sync_inode_valid(struct xfs_inode *ip, struct xfs_perag *pag); int xfs_inode_ag_iterator(struct xfs_mount *mp, int (*execute)(struct xfs_inode *ip, struct xfs_perag *pag, int flags), - int flags, int tag); + int flags, int tag, int write_lock); #endif diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h index c40834bdee5..c22a608321a 100644 --- a/fs/xfs/linux-2.6/xfs_trace.h +++ b/fs/xfs/linux-2.6/xfs_trace.h @@ -33,51 +33,55 @@ struct xfs_dquot; struct xlog_ticket; struct log; +DECLARE_EVENT_CLASS(xfs_attr_list_class, + TP_PROTO(struct xfs_attr_list_context *ctx), + TP_ARGS(ctx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(u32, hashval) + __field(u32, blkno) + __field(u32, offset) + __field(void *, alist) + __field(int, bufsize) + __field(int, count) + __field(int, firstu) + __field(int, dupcnt) + __field(int, flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; + __entry->ino = ctx->dp->i_ino; + __entry->hashval = ctx->cursor->hashval; + __entry->blkno = ctx->cursor->blkno; + __entry->offset = ctx->cursor->offset; + __entry->alist = ctx->alist; + __entry->bufsize = ctx->bufsize; + __entry->count = ctx->count; + __entry->firstu = ctx->firstu; + __entry->flags = ctx->flags; + ), + TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " + "alist 0x%p size %u count %u firstu %u flags %d %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->hashval, + __entry->blkno, + __entry->offset, + __entry->dupcnt, + __entry->alist, + __entry->bufsize, + __entry->count, + __entry->firstu, + __entry->flags, + __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) + ) +) + #define DEFINE_ATTR_LIST_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_attr_list_class, name, \ TP_PROTO(struct xfs_attr_list_context *ctx), \ - TP_ARGS(ctx), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(u32, hashval) \ - __field(u32, blkno) \ - __field(u32, offset) \ - __field(void *, alist) \ - __field(int, bufsize) \ - __field(int, count) \ - __field(int, firstu) \ - __field(int, dupcnt) \ - __field(int, flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ctx->dp)->i_sb->s_dev; \ - __entry->ino = ctx->dp->i_ino; \ - __entry->hashval = ctx->cursor->hashval; \ - __entry->blkno = ctx->cursor->blkno; \ - __entry->offset = ctx->cursor->offset; \ - __entry->alist = ctx->alist; \ - __entry->bufsize = ctx->bufsize; \ - __entry->count = ctx->count; \ - __entry->firstu = ctx->firstu; \ - __entry->flags = ctx->flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx cursor h/b/o 0x%x/0x%x/%u dupcnt %u " \ - "alist 0x%p size %u count %u firstu %u flags %d %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->hashval, \ - __entry->blkno, \ - __entry->offset, \ - __entry->dupcnt, \ - __entry->alist, \ - __entry->bufsize, \ - __entry->count, \ - __entry->firstu, \ - __entry->flags, \ - __print_flags(__entry->flags, "|", XFS_ATTR_FLAGS) \ - ) \ -) + TP_ARGS(ctx)) DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_sf_all); DEFINE_ATTR_LIST_EVENT(xfs_attr_list_leaf); @@ -178,91 +182,99 @@ TRACE_EVENT(xfs_iext_insert, (char *)__entry->caller_ip) ); +DECLARE_EVENT_CLASS(xfs_bmap_class, + TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, + unsigned long caller_ip), + TP_ARGS(ip, idx, state, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_extnum_t, idx) + __field(xfs_fileoff_t, startoff) + __field(xfs_fsblock_t, startblock) + __field(xfs_filblks_t, blockcount) + __field(xfs_exntst_t, state) + __field(int, bmap_state) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? + ip->i_afp : &ip->i_df; + struct xfs_bmbt_irec r; + + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->idx = idx; + __entry->startoff = r.br_startoff; + __entry->startblock = r.br_startblock; + __entry->blockcount = r.br_blockcount; + __entry->state = r.br_state; + __entry->bmap_state = state; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " + "offset %lld block %s count %lld flag %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), + (long)__entry->idx, + __entry->startoff, + xfs_fmtfsblock(__entry->startblock), + __entry->blockcount, + __entry->state, + (char *)__entry->caller_ip) +) + #define DEFINE_BMAP_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_bmap_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_extnum_t idx, int state, \ unsigned long caller_ip), \ - TP_ARGS(ip, idx, state, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_extnum_t, idx) \ - __field(xfs_fileoff_t, startoff) \ - __field(xfs_fsblock_t, startblock) \ - __field(xfs_filblks_t, blockcount) \ - __field(xfs_exntst_t, state) \ - __field(int, bmap_state) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - struct xfs_ifork *ifp = (state & BMAP_ATTRFORK) ? \ - ip->i_afp : &ip->i_df; \ - struct xfs_bmbt_irec r; \ - \ - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r); \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->idx = idx; \ - __entry->startoff = r.br_startoff; \ - __entry->startblock = r.br_startblock; \ - __entry->blockcount = r.br_blockcount; \ - __entry->state = r.br_state; \ - __entry->bmap_state = state; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx state %s idx %ld " \ - "offset %lld block %s count %lld flag %d caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->bmap_state, "|", XFS_BMAP_EXT_FLAGS), \ - (long)__entry->idx, \ - __entry->startoff, \ - xfs_fmtfsblock(__entry->startblock), \ - __entry->blockcount, \ - __entry->state, \ - (char *)__entry->caller_ip) \ -) - + TP_ARGS(ip, idx, state, caller_ip)) DEFINE_BMAP_EVENT(xfs_iext_remove); DEFINE_BMAP_EVENT(xfs_bmap_pre_update); DEFINE_BMAP_EVENT(xfs_bmap_post_update); DEFINE_BMAP_EVENT(xfs_extlist); -#define DEFINE_BUF_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ - TP_ARGS(bp, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, bno) \ - __field(size_t, buffer_length) \ - __field(int, hold) \ - __field(int, pincount) \ - __field(unsigned, lockval) \ - __field(unsigned, flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = bp->b_target->bt_dev; \ - __entry->bno = bp->b_bn; \ - __entry->buffer_length = bp->b_buffer_length; \ - __entry->hold = atomic_read(&bp->b_hold); \ - __entry->pincount = atomic_read(&bp->b_pin_count); \ - __entry->lockval = xfs_buf_lock_value(bp); \ - __entry->flags = bp->b_flags; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->bno, \ - __entry->buffer_length, \ - __entry->hold, \ - __entry->pincount, \ - __entry->lockval, \ - __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ - (void *)__entry->caller_ip) \ +DECLARE_EVENT_CLASS(xfs_buf_class, + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), + TP_ARGS(bp, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->flags = bp->b_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) ) + +#define DEFINE_BUF_EVENT(name) \ +DEFINE_EVENT(xfs_buf_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned long caller_ip), \ + TP_ARGS(bp, caller_ip)) DEFINE_BUF_EVENT(xfs_buf_init); DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); @@ -299,41 +311,45 @@ DEFINE_BUF_EVENT(xfs_reset_dqcounts); DEFINE_BUF_EVENT(xfs_inode_item_push); /* pass flags explicitly */ -#define DEFINE_BUF_FLAGS_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ - TP_ARGS(bp, flags, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, bno) \ - __field(size_t, buffer_length) \ - __field(int, hold) \ - __field(int, pincount) \ - __field(unsigned, lockval) \ - __field(unsigned, flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = bp->b_target->bt_dev; \ - __entry->bno = bp->b_bn; \ - __entry->buffer_length = bp->b_buffer_length; \ - __entry->flags = flags; \ - __entry->hold = atomic_read(&bp->b_hold); \ - __entry->pincount = atomic_read(&bp->b_pin_count); \ - __entry->lockval = xfs_buf_lock_value(bp); \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->bno, \ - __entry->buffer_length, \ - __entry->hold, \ - __entry->pincount, \ - __entry->lockval, \ - __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), \ - (void *)__entry->caller_ip) \ +DECLARE_EVENT_CLASS(xfs_buf_flags_class, + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), + TP_ARGS(bp, flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, bno) + __field(size_t, buffer_length) + __field(int, hold) + __field(int, pincount) + __field(unsigned, lockval) + __field(unsigned, flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = bp->b_target->bt_dev; + __entry->bno = bp->b_bn; + __entry->buffer_length = bp->b_buffer_length; + __entry->flags = flags; + __entry->hold = atomic_read(&bp->b_hold); + __entry->pincount = atomic_read(&bp->b_pin_count); + __entry->lockval = xfs_buf_lock_value(bp); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->bno, + __entry->buffer_length, + __entry->hold, + __entry->pincount, + __entry->lockval, + __print_flags(__entry->flags, "|", XFS_BUF_FLAGS), + (void *)__entry->caller_ip) ) + +#define DEFINE_BUF_FLAGS_EVENT(name) \ +DEFINE_EVENT(xfs_buf_flags_class, name, \ + TP_PROTO(struct xfs_buf *bp, unsigned flags, unsigned long caller_ip), \ + TP_ARGS(bp, flags, caller_ip)) DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); @@ -376,55 +392,58 @@ TRACE_EVENT(xfs_buf_ioerror, (void *)__entry->caller_ip) ); -#define DEFINE_BUF_ITEM_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_buf_log_item *bip), \ - TP_ARGS(bip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_daddr_t, buf_bno) \ - __field(size_t, buf_len) \ - __field(int, buf_hold) \ - __field(int, buf_pincount) \ - __field(int, buf_lockval) \ - __field(unsigned, buf_flags) \ - __field(unsigned, bli_recur) \ - __field(int, bli_refcount) \ - __field(unsigned, bli_flags) \ - __field(void *, li_desc) \ - __field(unsigned, li_flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = bip->bli_buf->b_target->bt_dev; \ - __entry->bli_flags = bip->bli_flags; \ - __entry->bli_recur = bip->bli_recur; \ - __entry->bli_refcount = atomic_read(&bip->bli_refcount); \ - __entry->buf_bno = bip->bli_buf->b_bn; \ - __entry->buf_len = bip->bli_buf->b_buffer_length; \ - __entry->buf_flags = bip->bli_buf->b_flags; \ - __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); \ - __entry->buf_pincount = \ - atomic_read(&bip->bli_buf->b_pin_count); \ - __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); \ - __entry->li_desc = bip->bli_item.li_desc; \ - __entry->li_flags = bip->bli_item.li_flags; \ - ), \ - TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " \ - "lock %d flags %s recur %d refcount %d bliflags %s " \ - "lidesc 0x%p liflags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - (unsigned long long)__entry->buf_bno, \ - __entry->buf_len, \ - __entry->buf_hold, \ - __entry->buf_pincount, \ - __entry->buf_lockval, \ - __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), \ - __entry->bli_recur, \ - __entry->bli_refcount, \ - __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), \ - __entry->li_desc, \ - __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) \ +DECLARE_EVENT_CLASS(xfs_buf_item_class, + TP_PROTO(struct xfs_buf_log_item *bip), + TP_ARGS(bip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_daddr_t, buf_bno) + __field(size_t, buf_len) + __field(int, buf_hold) + __field(int, buf_pincount) + __field(int, buf_lockval) + __field(unsigned, buf_flags) + __field(unsigned, bli_recur) + __field(int, bli_refcount) + __field(unsigned, bli_flags) + __field(void *, li_desc) + __field(unsigned, li_flags) + ), + TP_fast_assign( + __entry->dev = bip->bli_buf->b_target->bt_dev; + __entry->bli_flags = bip->bli_flags; + __entry->bli_recur = bip->bli_recur; + __entry->bli_refcount = atomic_read(&bip->bli_refcount); + __entry->buf_bno = bip->bli_buf->b_bn; + __entry->buf_len = bip->bli_buf->b_buffer_length; + __entry->buf_flags = bip->bli_buf->b_flags; + __entry->buf_hold = atomic_read(&bip->bli_buf->b_hold); + __entry->buf_pincount = atomic_read(&bip->bli_buf->b_pin_count); + __entry->buf_lockval = xfs_buf_lock_value(bip->bli_buf); + __entry->li_desc = bip->bli_item.li_desc; + __entry->li_flags = bip->bli_item.li_flags; + ), + TP_printk("dev %d:%d bno 0x%llx len 0x%zx hold %d pincount %d " + "lock %d flags %s recur %d refcount %d bliflags %s " + "lidesc 0x%p liflags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->buf_bno, + __entry->buf_len, + __entry->buf_hold, + __entry->buf_pincount, + __entry->buf_lockval, + __print_flags(__entry->buf_flags, "|", XFS_BUF_FLAGS), + __entry->bli_recur, + __entry->bli_refcount, + __print_flags(__entry->bli_flags, "|", XFS_BLI_FLAGS), + __entry->li_desc, + __print_flags(__entry->li_flags, "|", XFS_LI_FLAGS)) ) + +#define DEFINE_BUF_ITEM_EVENT(name) \ +DEFINE_EVENT(xfs_buf_item_class, name, \ + TP_PROTO(struct xfs_buf_log_item *bip), \ + TP_ARGS(bip)) DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_size_stale); DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format); @@ -450,78 +469,90 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold); DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release); DEFINE_BUF_ITEM_EVENT(xfs_trans_binval); +DECLARE_EVENT_CLASS(xfs_lock_class, + TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, + unsigned long caller_ip), + TP_ARGS(ip, lock_flags, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, lock_flags) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->lock_flags = lock_flags; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), + (void *)__entry->caller_ip) +) + #define DEFINE_LOCK_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_lock_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned lock_flags, \ unsigned long caller_ip), \ - TP_ARGS(ip, lock_flags, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, lock_flags) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->lock_flags = lock_flags; \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx flags %s caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->lock_flags, "|", XFS_LOCK_FLAGS), \ - (void *)__entry->caller_ip) \ -) - + TP_ARGS(ip, lock_flags, caller_ip)) DEFINE_LOCK_EVENT(xfs_ilock); DEFINE_LOCK_EVENT(xfs_ilock_nowait); DEFINE_LOCK_EVENT(xfs_ilock_demote); DEFINE_LOCK_EVENT(xfs_iunlock); +DECLARE_EVENT_CLASS(xfs_iget_class, + TP_PROTO(struct xfs_inode *ip), + TP_ARGS(ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + ), + TP_printk("dev %d:%d ino 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino) +) + #define DEFINE_IGET_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_iget_class, name, \ TP_PROTO(struct xfs_inode *ip), \ - TP_ARGS(ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino) \ -) + TP_ARGS(ip)) DEFINE_IGET_EVENT(xfs_iget_skip); DEFINE_IGET_EVENT(xfs_iget_reclaim); DEFINE_IGET_EVENT(xfs_iget_found); DEFINE_IGET_EVENT(xfs_iget_alloc); +DECLARE_EVENT_CLASS(xfs_inode_class, + TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), + TP_ARGS(ip, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, count) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->count = atomic_read(&VFS_I(ip)->i_count); + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->count, + (char *)__entry->caller_ip) +) + #define DEFINE_INODE_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_inode_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ - TP_ARGS(ip, caller_ip), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, count) \ - __field(unsigned long, caller_ip) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->count = atomic_read(&VFS_I(ip)->i_count); \ - __entry->caller_ip = caller_ip; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx count %d caller %pf", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->count, \ - (char *)__entry->caller_ip) \ -) + TP_ARGS(ip, caller_ip)) DEFINE_INODE_EVENT(xfs_ihold); DEFINE_INODE_EVENT(xfs_irele); /* the old xfs_itrace_entry tracer - to be replaced by s.th. in the VFS */ @@ -529,55 +560,59 @@ DEFINE_INODE_EVENT(xfs_inode); #define xfs_itrace_entry(ip) \ trace_xfs_inode(ip, _THIS_IP_) -#define DEFINE_DQUOT_EVENT(tname) \ -TRACE_EVENT(tname, \ - TP_PROTO(struct xfs_dquot *dqp), \ - TP_ARGS(dqp), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(__be32, id) \ - __field(unsigned, flags) \ - __field(unsigned, nrefs) \ - __field(unsigned long long, res_bcount) \ - __field(unsigned long long, bcount) \ - __field(unsigned long long, icount) \ - __field(unsigned long long, blk_hardlimit) \ - __field(unsigned long long, blk_softlimit) \ - __field(unsigned long long, ino_hardlimit) \ - __field(unsigned long long, ino_softlimit) \ - ), \ - TP_fast_assign( \ - __entry->dev = dqp->q_mount->m_super->s_dev; \ - __entry->id = dqp->q_core.d_id; \ - __entry->flags = dqp->dq_flags; \ - __entry->nrefs = dqp->q_nrefs; \ - __entry->res_bcount = dqp->q_res_bcount; \ - __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); \ - __entry->icount = be64_to_cpu(dqp->q_core.d_icount); \ - __entry->blk_hardlimit = \ - be64_to_cpu(dqp->q_core.d_blk_hardlimit); \ - __entry->blk_softlimit = \ - be64_to_cpu(dqp->q_core.d_blk_softlimit); \ - __entry->ino_hardlimit = \ - be64_to_cpu(dqp->q_core.d_ino_hardlimit); \ - __entry->ino_softlimit = \ - be64_to_cpu(dqp->q_core.d_ino_softlimit); \ +DECLARE_EVENT_CLASS(xfs_dquot_class, + TP_PROTO(struct xfs_dquot *dqp), + TP_ARGS(dqp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(__be32, id) + __field(unsigned, flags) + __field(unsigned, nrefs) + __field(unsigned long long, res_bcount) + __field(unsigned long long, bcount) + __field(unsigned long long, icount) + __field(unsigned long long, blk_hardlimit) + __field(unsigned long long, blk_softlimit) + __field(unsigned long long, ino_hardlimit) + __field(unsigned long long, ino_softlimit) ), \ - TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " \ - "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " \ - "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - be32_to_cpu(__entry->id), \ - __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), \ - __entry->nrefs, \ - __entry->res_bcount, \ - __entry->bcount, \ - __entry->blk_hardlimit, \ - __entry->blk_softlimit, \ - __entry->icount, \ - __entry->ino_hardlimit, \ - __entry->ino_softlimit) \ + TP_fast_assign( + __entry->dev = dqp->q_mount->m_super->s_dev; + __entry->id = dqp->q_core.d_id; + __entry->flags = dqp->dq_flags; + __entry->nrefs = dqp->q_nrefs; + __entry->res_bcount = dqp->q_res_bcount; + __entry->bcount = be64_to_cpu(dqp->q_core.d_bcount); + __entry->icount = be64_to_cpu(dqp->q_core.d_icount); + __entry->blk_hardlimit = + be64_to_cpu(dqp->q_core.d_blk_hardlimit); + __entry->blk_softlimit = + be64_to_cpu(dqp->q_core.d_blk_softlimit); + __entry->ino_hardlimit = + be64_to_cpu(dqp->q_core.d_ino_hardlimit); + __entry->ino_softlimit = + be64_to_cpu(dqp->q_core.d_ino_softlimit); + ), + TP_printk("dev %d:%d id 0x%x flags %s nrefs %u res_bc 0x%llx " + "bcnt 0x%llx [hard 0x%llx | soft 0x%llx] " + "icnt 0x%llx [hard 0x%llx | soft 0x%llx]", + MAJOR(__entry->dev), MINOR(__entry->dev), + be32_to_cpu(__entry->id), + __print_flags(__entry->flags, "|", XFS_DQ_FLAGS), + __entry->nrefs, + __entry->res_bcount, + __entry->bcount, + __entry->blk_hardlimit, + __entry->blk_softlimit, + __entry->icount, + __entry->ino_hardlimit, + __entry->ino_softlimit) ) + +#define DEFINE_DQUOT_EVENT(name) \ +DEFINE_EVENT(xfs_dquot_class, name, \ + TP_PROTO(struct xfs_dquot *dqp), \ + TP_ARGS(dqp)) DEFINE_DQUOT_EVENT(xfs_dqadjust); DEFINE_DQUOT_EVENT(xfs_dqshake_dirty); DEFINE_DQUOT_EVENT(xfs_dqshake_unlink); @@ -610,72 +645,75 @@ DEFINE_DQUOT_EVENT(xfs_dqflush_done); DEFINE_IGET_EVENT(xfs_dquot_dqalloc); DEFINE_IGET_EVENT(xfs_dquot_dqdetach); +DECLARE_EVENT_CLASS(xfs_loggrant_class, + TP_PROTO(struct log *log, struct xlog_ticket *tic), + TP_ARGS(log, tic), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned, trans_type) + __field(char, ocnt) + __field(char, cnt) + __field(int, curr_res) + __field(int, unit_res) + __field(unsigned int, flags) + __field(void *, reserve_headq) + __field(void *, write_headq) + __field(int, grant_reserve_cycle) + __field(int, grant_reserve_bytes) + __field(int, grant_write_cycle) + __field(int, grant_write_bytes) + __field(int, curr_cycle) + __field(int, curr_block) + __field(xfs_lsn_t, tail_lsn) + ), + TP_fast_assign( + __entry->dev = log->l_mp->m_super->s_dev; + __entry->trans_type = tic->t_trans_type; + __entry->ocnt = tic->t_ocnt; + __entry->cnt = tic->t_cnt; + __entry->curr_res = tic->t_curr_res; + __entry->unit_res = tic->t_unit_res; + __entry->flags = tic->t_flags; + __entry->reserve_headq = log->l_reserve_headq; + __entry->write_headq = log->l_write_headq; + __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; + __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; + __entry->grant_write_cycle = log->l_grant_write_cycle; + __entry->grant_write_bytes = log->l_grant_write_bytes; + __entry->curr_cycle = log->l_curr_cycle; + __entry->curr_block = log->l_curr_block; + __entry->tail_lsn = log->l_tail_lsn; + ), + TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " + "t_unit_res %u t_flags %s reserve_headq 0x%p " + "write_headq 0x%p grant_reserve_cycle %d " + "grant_reserve_bytes %d grant_write_cycle %d " + "grant_write_bytes %d curr_cycle %d curr_block %d " + "tail_cycle %d tail_block %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), + __entry->ocnt, + __entry->cnt, + __entry->curr_res, + __entry->unit_res, + __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), + __entry->reserve_headq, + __entry->write_headq, + __entry->grant_reserve_cycle, + __entry->grant_reserve_bytes, + __entry->grant_write_cycle, + __entry->grant_write_bytes, + __entry->curr_cycle, + __entry->curr_block, + CYCLE_LSN(__entry->tail_lsn), + BLOCK_LSN(__entry->tail_lsn) + ) +) -#define DEFINE_LOGGRANT_EVENT(tname) \ -TRACE_EVENT(tname, \ +#define DEFINE_LOGGRANT_EVENT(name) \ +DEFINE_EVENT(xfs_loggrant_class, name, \ TP_PROTO(struct log *log, struct xlog_ticket *tic), \ - TP_ARGS(log, tic), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(unsigned, trans_type) \ - __field(char, ocnt) \ - __field(char, cnt) \ - __field(int, curr_res) \ - __field(int, unit_res) \ - __field(unsigned int, flags) \ - __field(void *, reserve_headq) \ - __field(void *, write_headq) \ - __field(int, grant_reserve_cycle) \ - __field(int, grant_reserve_bytes) \ - __field(int, grant_write_cycle) \ - __field(int, grant_write_bytes) \ - __field(int, curr_cycle) \ - __field(int, curr_block) \ - __field(xfs_lsn_t, tail_lsn) \ - ), \ - TP_fast_assign( \ - __entry->dev = log->l_mp->m_super->s_dev; \ - __entry->trans_type = tic->t_trans_type; \ - __entry->ocnt = tic->t_ocnt; \ - __entry->cnt = tic->t_cnt; \ - __entry->curr_res = tic->t_curr_res; \ - __entry->unit_res = tic->t_unit_res; \ - __entry->flags = tic->t_flags; \ - __entry->reserve_headq = log->l_reserve_headq; \ - __entry->write_headq = log->l_write_headq; \ - __entry->grant_reserve_cycle = log->l_grant_reserve_cycle; \ - __entry->grant_reserve_bytes = log->l_grant_reserve_bytes; \ - __entry->grant_write_cycle = log->l_grant_write_cycle; \ - __entry->grant_write_bytes = log->l_grant_write_bytes; \ - __entry->curr_cycle = log->l_curr_cycle; \ - __entry->curr_block = log->l_curr_block; \ - __entry->tail_lsn = log->l_tail_lsn; \ - ), \ - TP_printk("dev %d:%d type %s t_ocnt %u t_cnt %u t_curr_res %u " \ - "t_unit_res %u t_flags %s reserve_headq 0x%p " \ - "write_headq 0x%p grant_reserve_cycle %d " \ - "grant_reserve_bytes %d grant_write_cycle %d " \ - "grant_write_bytes %d curr_cycle %d curr_block %d " \ - "tail_cycle %d tail_block %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __print_symbolic(__entry->trans_type, XFS_TRANS_TYPES), \ - __entry->ocnt, \ - __entry->cnt, \ - __entry->curr_res, \ - __entry->unit_res, \ - __print_flags(__entry->flags, "|", XLOG_TIC_FLAGS), \ - __entry->reserve_headq, \ - __entry->write_headq, \ - __entry->grant_reserve_cycle, \ - __entry->grant_reserve_bytes, \ - __entry->grant_write_cycle, \ - __entry->grant_write_bytes, \ - __entry->curr_cycle, \ - __entry->curr_block, \ - CYCLE_LSN(__entry->tail_lsn), \ - BLOCK_LSN(__entry->tail_lsn) \ - ) \ -) + TP_ARGS(log, tic)) DEFINE_LOGGRANT_EVENT(xfs_log_done_nonperm); DEFINE_LOGGRANT_EVENT(xfs_log_done_perm); DEFINE_LOGGRANT_EVENT(xfs_log_reserve); @@ -815,7 +853,7 @@ TRACE_EVENT(name, \ ), \ TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx " \ "offset 0x%llx count %zd flags %s " \ - "startoff 0x%llx startblock 0x%llx blockcount 0x%llx", \ + "startoff 0x%llx startblock %s blockcount 0x%llx", \ MAJOR(__entry->dev), MINOR(__entry->dev), \ __entry->ino, \ __entry->size, \ @@ -824,7 +862,7 @@ TRACE_EVENT(name, \ __entry->count, \ __print_flags(__entry->flags, "|", BMAPI_FLAGS), \ __entry->startoff, \ - __entry->startblock, \ + xfs_fmtfsblock(__entry->startblock), \ __entry->blockcount) \ ) DEFINE_IOMAP_EVENT(xfs_iomap_enter); @@ -897,28 +935,32 @@ TRACE_EVENT(xfs_itruncate_start, __entry->toss_finish) ); +DECLARE_EVENT_CLASS(xfs_itrunc_class, + TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), + TP_ARGS(ip, new_size), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_fsize_t, size) + __field(xfs_fsize_t, new_size) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->size = ip->i_d.di_size; + __entry->new_size = new_size; + ), + TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->size, + __entry->new_size) +) + #define DEFINE_ITRUNC_EVENT(name) \ -TRACE_EVENT(name, \ +DEFINE_EVENT(xfs_itrunc_class, name, \ TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), \ - TP_ARGS(ip, new_size), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(xfs_fsize_t, size) \ - __field(xfs_fsize_t, new_size) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(ip)->i_sb->s_dev; \ - __entry->ino = ip->i_ino; \ - __entry->size = ip->i_d.di_size; \ - __entry->new_size = new_size; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx size 0x%llx new_size 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->size, \ - __entry->new_size) \ -) + TP_ARGS(ip, new_size)) DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_start); DEFINE_ITRUNC_EVENT(xfs_itruncate_finish_end); @@ -1037,28 +1079,28 @@ TRACE_EVENT(xfs_alloc_unbusy, TRACE_EVENT(xfs_alloc_busysearch, TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agblock_t agbno, - xfs_extlen_t len, int found), - TP_ARGS(mp, agno, agbno, len, found), + xfs_extlen_t len, xfs_lsn_t lsn), + TP_ARGS(mp, agno, agbno, len, lsn), TP_STRUCT__entry( __field(dev_t, dev) __field(xfs_agnumber_t, agno) __field(xfs_agblock_t, agbno) __field(xfs_extlen_t, len) - __field(int, found) + __field(xfs_lsn_t, lsn) ), TP_fast_assign( __entry->dev = mp->m_super->s_dev; __entry->agno = agno; __entry->agbno = agbno; __entry->len = len; - __entry->found = found; + __entry->lsn = lsn; ), - TP_printk("dev %d:%d agno %u agbno %u len %u %s", + TP_printk("dev %d:%d agno %u agbno %u len %u force lsn 0x%llx", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, __entry->agbno, __entry->len, - __print_symbolic(__entry->found, XFS_BUSY_STATES)) + __entry->lsn) ); TRACE_EVENT(xfs_agf, @@ -1152,77 +1194,80 @@ TRACE_EVENT(xfs_free_extent, ); -#define DEFINE_ALLOC_EVENT(name) \ -TRACE_EVENT(name, \ - TP_PROTO(struct xfs_alloc_arg *args), \ - TP_ARGS(args), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_agnumber_t, agno) \ - __field(xfs_agblock_t, agbno) \ - __field(xfs_extlen_t, minlen) \ - __field(xfs_extlen_t, maxlen) \ - __field(xfs_extlen_t, mod) \ - __field(xfs_extlen_t, prod) \ - __field(xfs_extlen_t, minleft) \ - __field(xfs_extlen_t, total) \ - __field(xfs_extlen_t, alignment) \ - __field(xfs_extlen_t, minalignslop) \ - __field(xfs_extlen_t, len) \ - __field(short, type) \ - __field(short, otype) \ - __field(char, wasdel) \ - __field(char, wasfromfl) \ - __field(char, isfl) \ - __field(char, userdata) \ - __field(xfs_fsblock_t, firstblock) \ - ), \ - TP_fast_assign( \ - __entry->dev = args->mp->m_super->s_dev; \ - __entry->agno = args->agno; \ - __entry->agbno = args->agbno; \ - __entry->minlen = args->minlen; \ - __entry->maxlen = args->maxlen; \ - __entry->mod = args->mod; \ - __entry->prod = args->prod; \ - __entry->minleft = args->minleft; \ - __entry->total = args->total; \ - __entry->alignment = args->alignment; \ - __entry->minalignslop = args->minalignslop; \ - __entry->len = args->len; \ - __entry->type = args->type; \ - __entry->otype = args->otype; \ - __entry->wasdel = args->wasdel; \ - __entry->wasfromfl = args->wasfromfl; \ - __entry->isfl = args->isfl; \ - __entry->userdata = args->userdata; \ - __entry->firstblock = args->firstblock; \ - ), \ - TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " \ - "prod %u minleft %u total %u alignment %u minalignslop %u " \ - "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " \ - "userdata %d firstblock 0x%llx", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->agno, \ - __entry->agbno, \ - __entry->minlen, \ - __entry->maxlen, \ - __entry->mod, \ - __entry->prod, \ - __entry->minleft, \ - __entry->total, \ - __entry->alignment, \ - __entry->minalignslop, \ - __entry->len, \ - __print_symbolic(__entry->type, XFS_ALLOC_TYPES), \ - __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), \ - __entry->wasdel, \ - __entry->wasfromfl, \ - __entry->isfl, \ - __entry->userdata, \ - __entry->firstblock) \ +DECLARE_EVENT_CLASS(xfs_alloc_class, + TP_PROTO(struct xfs_alloc_arg *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agblock_t, agbno) + __field(xfs_extlen_t, minlen) + __field(xfs_extlen_t, maxlen) + __field(xfs_extlen_t, mod) + __field(xfs_extlen_t, prod) + __field(xfs_extlen_t, minleft) + __field(xfs_extlen_t, total) + __field(xfs_extlen_t, alignment) + __field(xfs_extlen_t, minalignslop) + __field(xfs_extlen_t, len) + __field(short, type) + __field(short, otype) + __field(char, wasdel) + __field(char, wasfromfl) + __field(char, isfl) + __field(char, userdata) + __field(xfs_fsblock_t, firstblock) + ), + TP_fast_assign( + __entry->dev = args->mp->m_super->s_dev; + __entry->agno = args->agno; + __entry->agbno = args->agbno; + __entry->minlen = args->minlen; + __entry->maxlen = args->maxlen; + __entry->mod = args->mod; + __entry->prod = args->prod; + __entry->minleft = args->minleft; + __entry->total = args->total; + __entry->alignment = args->alignment; + __entry->minalignslop = args->minalignslop; + __entry->len = args->len; + __entry->type = args->type; + __entry->otype = args->otype; + __entry->wasdel = args->wasdel; + __entry->wasfromfl = args->wasfromfl; + __entry->isfl = args->isfl; + __entry->userdata = args->userdata; + __entry->firstblock = args->firstblock; + ), + TP_printk("dev %d:%d agno %u agbno %u minlen %u maxlen %u mod %u " + "prod %u minleft %u total %u alignment %u minalignslop %u " + "len %u type %s otype %s wasdel %d wasfromfl %d isfl %d " + "userdata %d firstblock 0x%llx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->agno, + __entry->agbno, + __entry->minlen, + __entry->maxlen, + __entry->mod, + __entry->prod, + __entry->minleft, + __entry->total, + __entry->alignment, + __entry->minalignslop, + __entry->len, + __print_symbolic(__entry->type, XFS_ALLOC_TYPES), + __print_symbolic(__entry->otype, XFS_ALLOC_TYPES), + __entry->wasdel, + __entry->wasfromfl, + __entry->isfl, + __entry->userdata, + __entry->firstblock) ) +#define DEFINE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_alloc_class, name, \ + TP_PROTO(struct xfs_alloc_arg *args), \ + TP_ARGS(args)) DEFINE_ALLOC_EVENT(xfs_alloc_exact_done); DEFINE_ALLOC_EVENT(xfs_alloc_exact_error); DEFINE_ALLOC_EVENT(xfs_alloc_near_nominleft); @@ -1245,92 +1290,100 @@ DEFINE_ALLOC_EVENT(xfs_alloc_vextent_noagbp); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_loopfailed); DEFINE_ALLOC_EVENT(xfs_alloc_vextent_allfailed); -#define DEFINE_DIR2_TRACE(tname) \ -TRACE_EVENT(tname, \ +DECLARE_EVENT_CLASS(xfs_dir2_class, + TP_PROTO(struct xfs_da_args *args), + TP_ARGS(args), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __dynamic_array(char, name, args->namelen) + __field(int, namelen) + __field(xfs_dahash_t, hashval) + __field(xfs_ino_t, inumber) + __field(int, op_flags) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + if (args->namelen) + memcpy(__get_str(name), args->name, args->namelen); + __entry->namelen = args->namelen; + __entry->hashval = args->hashval; + __entry->inumber = args->inumber; + __entry->op_flags = args->op_flags; + ), + TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " + "inumber 0x%llx op_flags %s", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->namelen, + __entry->namelen ? __get_str(name) : NULL, + __entry->namelen, + __entry->hashval, + __entry->inumber, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) +) + +#define DEFINE_DIR2_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_class, name, \ TP_PROTO(struct xfs_da_args *args), \ - TP_ARGS(args), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __dynamic_array(char, name, args->namelen) \ - __field(int, namelen) \ - __field(xfs_dahash_t, hashval) \ - __field(xfs_ino_t, inumber) \ - __field(int, op_flags) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \ - __entry->ino = args->dp->i_ino; \ - if (args->namelen) \ - memcpy(__get_str(name), args->name, args->namelen); \ - __entry->namelen = args->namelen; \ - __entry->hashval = args->hashval; \ - __entry->inumber = args->inumber; \ - __entry->op_flags = args->op_flags; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx name %.*s namelen %d hashval 0x%x " \ - "inumber 0x%llx op_flags %s", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __entry->namelen, \ - __entry->namelen ? __get_str(name) : NULL, \ - __entry->namelen, \ - __entry->hashval, \ - __entry->inumber, \ - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS)) \ + TP_ARGS(args)) +DEFINE_DIR2_EVENT(xfs_dir2_sf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_sf_create); +DEFINE_DIR2_EVENT(xfs_dir2_sf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_sf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_sf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino4); +DEFINE_DIR2_EVENT(xfs_dir2_sf_toino8); +DEFINE_DIR2_EVENT(xfs_dir2_sf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_block_addname); +DEFINE_DIR2_EVENT(xfs_dir2_block_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_block_replace); +DEFINE_DIR2_EVENT(xfs_dir2_block_removename); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_sf); +DEFINE_DIR2_EVENT(xfs_dir2_block_to_leaf); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_addname); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_replace); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_removename); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_block); +DEFINE_DIR2_EVENT(xfs_dir2_leaf_to_node); +DEFINE_DIR2_EVENT(xfs_dir2_node_addname); +DEFINE_DIR2_EVENT(xfs_dir2_node_lookup); +DEFINE_DIR2_EVENT(xfs_dir2_node_replace); +DEFINE_DIR2_EVENT(xfs_dir2_node_removename); +DEFINE_DIR2_EVENT(xfs_dir2_node_to_leaf); + +DECLARE_EVENT_CLASS(xfs_dir2_space_class, + TP_PROTO(struct xfs_da_args *args, int idx), + TP_ARGS(args, idx), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, op_flags) + __field(int, idx) + ), + TP_fast_assign( + __entry->dev = VFS_I(args->dp)->i_sb->s_dev; + __entry->ino = args->dp->i_ino; + __entry->op_flags = args->op_flags; + __entry->idx = idx; + ), + TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), + __entry->idx) ) -DEFINE_DIR2_TRACE(xfs_dir2_sf_addname); -DEFINE_DIR2_TRACE(xfs_dir2_sf_create); -DEFINE_DIR2_TRACE(xfs_dir2_sf_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_sf_replace); -DEFINE_DIR2_TRACE(xfs_dir2_sf_removename); -DEFINE_DIR2_TRACE(xfs_dir2_sf_toino4); -DEFINE_DIR2_TRACE(xfs_dir2_sf_toino8); -DEFINE_DIR2_TRACE(xfs_dir2_sf_to_block); -DEFINE_DIR2_TRACE(xfs_dir2_block_addname); -DEFINE_DIR2_TRACE(xfs_dir2_block_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_block_replace); -DEFINE_DIR2_TRACE(xfs_dir2_block_removename); -DEFINE_DIR2_TRACE(xfs_dir2_block_to_sf); -DEFINE_DIR2_TRACE(xfs_dir2_block_to_leaf); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_addname); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_replace); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_removename); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_block); -DEFINE_DIR2_TRACE(xfs_dir2_leaf_to_node); -DEFINE_DIR2_TRACE(xfs_dir2_node_addname); -DEFINE_DIR2_TRACE(xfs_dir2_node_lookup); -DEFINE_DIR2_TRACE(xfs_dir2_node_replace); -DEFINE_DIR2_TRACE(xfs_dir2_node_removename); -DEFINE_DIR2_TRACE(xfs_dir2_node_to_leaf); -#define DEFINE_DIR2_SPACE_TRACE(tname) \ -TRACE_EVENT(tname, \ +#define DEFINE_DIR2_SPACE_EVENT(name) \ +DEFINE_EVENT(xfs_dir2_space_class, name, \ TP_PROTO(struct xfs_da_args *args, int idx), \ - TP_ARGS(args, idx), \ - TP_STRUCT__entry( \ - __field(dev_t, dev) \ - __field(xfs_ino_t, ino) \ - __field(int, op_flags) \ - __field(int, idx) \ - ), \ - TP_fast_assign( \ - __entry->dev = VFS_I(args->dp)->i_sb->s_dev; \ - __entry->ino = args->dp->i_ino; \ - __entry->op_flags = args->op_flags; \ - __entry->idx = idx; \ - ), \ - TP_printk("dev %d:%d ino 0x%llx op_flags %s index %d", \ - MAJOR(__entry->dev), MINOR(__entry->dev), \ - __entry->ino, \ - __print_flags(__entry->op_flags, "|", XFS_DA_OP_FLAGS), \ - __entry->idx) \ -) -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_add); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_leafn_remove); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_grow_inode); -DEFINE_DIR2_SPACE_TRACE(xfs_dir2_shrink_inode); + TP_ARGS(args, idx)) +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_add); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_leafn_remove); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_grow_inode); +DEFINE_DIR2_SPACE_EVENT(xfs_dir2_shrink_inode); TRACE_EVENT(xfs_dir2_leafn_moveents, TP_PROTO(struct xfs_da_args *args, int src_idx, int dst_idx, int count), diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 71af76fe8a2..873e07e2907 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -891,7 +891,7 @@ xfs_qm_dqrele_all_inodes( uint flags) { ASSERT(mp->m_quotainfo); - xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG); + xfs_inode_ag_iterator(mp, xfs_dqrele_inode, flags, XFS_ICI_NO_TAG, 0); } /*------------------------------------------------------------------------*/ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index a1c65fc6d9c..275b1f4f943 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -2563,43 +2563,41 @@ xfs_alloc_search_busy(xfs_trans_t *tp, xfs_mount_t *mp; xfs_perag_busy_t *bsy; xfs_agblock_t uend, bend; - xfs_lsn_t lsn; + xfs_lsn_t lsn = 0; int cnt; mp = tp->t_mountp; spin_lock(&mp->m_perag[agno].pagb_lock); - cnt = mp->m_perag[agno].pagb_count; uend = bno + len - 1; - /* search pagb_list for this slot, skipping open slots */ - for (bsy = mp->m_perag[agno].pagb_list; cnt; bsy++) { + /* + * search pagb_list for this slot, skipping open slots. We have to + * search the entire array as there may be multiple overlaps and + * we have to get the most recent LSN for the log force to push out + * all the transactions that span the range. + */ + for (cnt = 0; cnt < mp->m_perag[agno].pagb_count; cnt++) { + bsy = &mp->m_perag[agno].pagb_list[cnt]; + if (!bsy->busy_tp) + continue; - /* - * (start1,length1) within (start2, length2) - */ - if (bsy->busy_tp != NULL) { - bend = bsy->busy_start + bsy->busy_length - 1; - if ((bno > bend) || (uend < bsy->busy_start)) { - cnt--; - } else { - break; - } - } - } + bend = bsy->busy_start + bsy->busy_length - 1; + if (bno > bend || uend < bsy->busy_start) + continue; - trace_xfs_alloc_busysearch(mp, agno, bno, len, !!cnt); + /* (start1,length1) within (start2, length2) */ + if (XFS_LSN_CMP(bsy->busy_tp->t_commit_lsn, lsn) > 0) + lsn = bsy->busy_tp->t_commit_lsn; + } + spin_unlock(&mp->m_perag[agno].pagb_lock); + trace_xfs_alloc_busysearch(tp->t_mountp, agno, bno, len, lsn); /* * If a block was found, force the log through the LSN of the * transaction that freed the block */ - if (cnt) { - lsn = bsy->busy_tp->t_commit_lsn; - spin_unlock(&mp->m_perag[agno].pagb_lock); + if (lsn) xfs_log_force(mp, lsn, XFS_LOG_FORCE|XFS_LOG_SYNC); - } else { - spin_unlock(&mp->m_perag[agno].pagb_lock); - } } diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index d1483a4f71b..84ca1cf16a1 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -114,10 +114,82 @@ xfs_swapext( return error; } +/* + * We need to check that the format of the data fork in the temporary inode is + * valid for the target inode before doing the swap. This is not a problem with + * attr1 because of the fixed fork offset, but attr2 has a dynamically sized + * data fork depending on the space the attribute fork is taking so we can get + * invalid formats on the target inode. + * + * E.g. target has space for 7 extents in extent format, temp inode only has + * space for 6. If we defragment down to 7 extents, then the tmp format is a + * btree, but when swapped it needs to be in extent format. Hence we can't just + * blindly swap data forks on attr2 filesystems. + * + * Note that we check the swap in both directions so that we don't end up with + * a corrupt temporary inode, either. + * + * Note that fixing the way xfs_fsr sets up the attribute fork in the source + * inode will prevent this situation from occurring, so all we do here is + * reject and log the attempt. basically we are putting the responsibility on + * userspace to get this right. + */ +static int +xfs_swap_extents_check_format( + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip) /* tmp inode */ +{ + + /* Should never get a local format */ + if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || + tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) + return EINVAL; + + /* + * if the target inode has less extents that then temporary inode then + * why did userspace call us? + */ + if (ip->i_d.di_nextents < tip->i_d.di_nextents) + return EINVAL; + + /* + * if the target inode is in extent form and the temp inode is in btree + * form then we will end up with the target inode in the wrong format + * as we already know there are less extents in the temp inode. + */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + tip->i_d.di_format == XFS_DINODE_FMT_BTREE) + return EINVAL; + + /* Check temp in extent form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK) > ip->i_df.if_ext_max) + return EINVAL; + + /* Check target in extent form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_EXTENTS && + XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK) > tip->i_df.if_ext_max) + return EINVAL; + + /* Check root block of temp in btree form to max in target */ + if (tip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(ip) && + tip->i_df.if_broot_bytes > XFS_IFORK_BOFF(ip)) + return EINVAL; + + /* Check root block of target in btree form to max in temp */ + if (ip->i_d.di_format == XFS_DINODE_FMT_BTREE && + XFS_IFORK_BOFF(tip) && + ip->i_df.if_broot_bytes > XFS_IFORK_BOFF(tip)) + return EINVAL; + + return 0; +} + int xfs_swap_extents( - xfs_inode_t *ip, - xfs_inode_t *tip, + xfs_inode_t *ip, /* target inode */ + xfs_inode_t *tip, /* tmp inode */ xfs_swapext_t *sxp) { xfs_mount_t *mp; @@ -161,13 +233,6 @@ xfs_swap_extents( goto out_unlock; } - /* Should never get a local format */ - if (ip->i_d.di_format == XFS_DINODE_FMT_LOCAL || - tip->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - error = XFS_ERROR(EINVAL); - goto out_unlock; - } - if (VN_CACHED(VFS_I(tip)) != 0) { error = xfs_flushinval_pages(tip, 0, -1, FI_REMAPF_LOCKED); @@ -189,13 +254,12 @@ xfs_swap_extents( goto out_unlock; } - /* - * If the target has extended attributes, the tmp file - * must also in order to ensure the correct data fork - * format. - */ - if ( XFS_IFORK_Q(ip) != XFS_IFORK_Q(tip) ) { - error = XFS_ERROR(EINVAL); + /* check inode formats now that data is flushed */ + error = xfs_swap_extents_check_format(ip, tip); + if (error) { + xfs_fs_cmn_err(CE_NOTE, mp, + "%s: inode 0x%llx format is incompatible for exchanging.", + __FILE__, ip->i_ino); goto out_unlock; } @@ -276,6 +340,16 @@ xfs_swap_extents( *tifp = *tempifp; /* struct copy */ /* + * Fix the in-memory data fork values that are dependent on the fork + * offset in the inode. We can't assume they remain the same as attr2 + * has dynamic fork offsets. + */ + ifp->if_ext_max = XFS_IFORK_SIZE(ip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + tifp->if_ext_max = XFS_IFORK_SIZE(tip, XFS_DATA_FORK) / + (uint)sizeof(xfs_bmbt_rec_t); + + /* * Fix the on-disk inode values */ tmp = (__uint64_t)ip->i_d.di_nblocks; diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index fa402a6bbbc..155e798f30a 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -73,7 +73,6 @@ xfs_inode_alloc( ASSERT(atomic_read(&ip->i_pincount) == 0); ASSERT(!spin_is_locked(&ip->i_flags_lock)); ASSERT(completion_done(&ip->i_flush)); - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ce278b3ae7f..ef77fd88c8e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2841,8 +2841,8 @@ xfs_iflush( mp = ip->i_mount; /* - * If the inode isn't dirty, then just release the inode - * flush lock and do nothing. + * If the inode isn't dirty, then just release the inode flush lock and + * do nothing. */ if (xfs_inode_clean(ip)) { xfs_ifunlock(ip); @@ -2868,6 +2868,19 @@ xfs_iflush( xfs_iunpin_wait(ip); /* + * For stale inodes we cannot rely on the backing buffer remaining + * stale in cache for the remaining life of the stale inode and so + * xfs_itobp() below may give us a buffer that no longer contains + * inodes below. We have to check this after ensuring the inode is + * unpinned so that it is safe to reclaim the stale inode after the + * flush call. + */ + if (xfs_iflags_test(ip, XFS_ISTALE)) { + xfs_ifunlock(ip); + return 0; + } + + /* * This may have been unpinned because the filesystem is shutting * down forcibly. If that's the case we must not write this inode * to disk, because the log record didn't make it to disk! diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 9e15a118536..6be05f756d5 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1517,6 +1517,8 @@ xfs_rtfree_range( */ error = xfs_rtfind_forw(mp, tp, end, mp->m_sb.sb_rextents - 1, &postblock); + if (error) + return error; /* * If there are blocks not being freed at the front of the * old extent, add summary data for them to be allocated. diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6558ffd8d14..6f268756bf3 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -70,7 +70,6 @@ xfs_setattr( uint commit_flags=0; uid_t uid=0, iuid=0; gid_t gid=0, igid=0; - int timeflags = 0; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int need_iolock = 1; @@ -135,16 +134,13 @@ xfs_setattr( if (flags & XFS_ATTR_NOLOCK) need_iolock = 0; if (!(mask & ATTR_SIZE)) { - if ((mask != (ATTR_CTIME|ATTR_ATIME|ATTR_MTIME)) || - (mp->m_flags & XFS_MOUNT_WSYNC)) { - tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); - commit_flags = 0; - if ((code = xfs_trans_reserve(tp, 0, - XFS_ICHANGE_LOG_RES(mp), 0, - 0, 0))) { - lock_flags = 0; - goto error_return; - } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); + commit_flags = 0; + code = xfs_trans_reserve(tp, 0, XFS_ICHANGE_LOG_RES(mp), + 0, 0, 0); + if (code) { + lock_flags = 0; + goto error_return; } } else { if (DM_EVENT_ENABLED(ip, DM_EVENT_TRUNCATE) && @@ -295,15 +291,23 @@ xfs_setattr( * or we are explicitly asked to change it. This handles * the semantic difference between truncate() and ftruncate() * as implemented in the VFS. + * + * The regular truncate() case without ATTR_CTIME and ATTR_MTIME + * is a special case where we need to update the times despite + * not having these flags set. For all other operations the + * VFS set these flags explicitly if it wants a timestamp + * update. */ - if (iattr->ia_size != ip->i_size || (mask & ATTR_CTIME)) - timeflags |= XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG; + if (iattr->ia_size != ip->i_size && + (!(mask & (ATTR_CTIME | ATTR_MTIME)))) { + iattr->ia_ctime = iattr->ia_mtime = + current_fs_time(inode->i_sb); + mask |= ATTR_CTIME | ATTR_MTIME; + } if (iattr->ia_size > ip->i_size) { ip->i_d.di_size = iattr->ia_size; ip->i_size = iattr->ia_size; - if (!(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } else if (iattr->ia_size <= ip->i_size || (iattr->ia_size == 0 && ip->i_d.di_nextents)) { @@ -374,9 +378,6 @@ xfs_setattr( ip->i_d.di_gid = gid; inode->i_gid = gid; } - - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* @@ -393,51 +394,37 @@ xfs_setattr( inode->i_mode &= S_IFMT; inode->i_mode |= mode & ~S_IFMT; - - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - timeflags |= XFS_ICHGTIME_CHG; } /* * Change file access or modified times. */ - if (mask & (ATTR_ATIME|ATTR_MTIME)) { - if (mask & ATTR_ATIME) { - inode->i_atime = iattr->ia_atime; - ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; - ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; - ip->i_update_core = 1; - } - if (mask & ATTR_MTIME) { - inode->i_mtime = iattr->ia_mtime; - ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; - ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; - timeflags &= ~XFS_ICHGTIME_MOD; - timeflags |= XFS_ICHGTIME_CHG; - } - if (tp && (mask & (ATTR_MTIME_SET|ATTR_ATIME_SET))) - xfs_trans_log_inode (tp, ip, XFS_ILOG_CORE); + if (mask & ATTR_ATIME) { + inode->i_atime = iattr->ia_atime; + ip->i_d.di_atime.t_sec = iattr->ia_atime.tv_sec; + ip->i_d.di_atime.t_nsec = iattr->ia_atime.tv_nsec; + ip->i_update_core = 1; } - - /* - * Change file inode change time only if ATTR_CTIME set - * AND we have been called by a DMI function. - */ - - if ((flags & XFS_ATTR_DMI) && (mask & ATTR_CTIME)) { + if (mask & ATTR_CTIME) { inode->i_ctime = iattr->ia_ctime; ip->i_d.di_ctime.t_sec = iattr->ia_ctime.tv_sec; ip->i_d.di_ctime.t_nsec = iattr->ia_ctime.tv_nsec; ip->i_update_core = 1; - timeflags &= ~XFS_ICHGTIME_CHG; + } + if (mask & ATTR_MTIME) { + inode->i_mtime = iattr->ia_mtime; + ip->i_d.di_mtime.t_sec = iattr->ia_mtime.tv_sec; + ip->i_d.di_mtime.t_nsec = iattr->ia_mtime.tv_nsec; + ip->i_update_core = 1; } /* - * Send out timestamp changes that need to be set to the - * current time. Not done when called by a DMI function. + * And finally, log the inode core if any attribute in it + * has been changed. */ - if (timeflags && !(flags & XFS_ATTR_DMI)) - xfs_ichgtime(ip, timeflags); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE| + ATTR_ATIME|ATTR_CTIME|ATTR_MTIME)) + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(xs_ig_attrchg); @@ -452,12 +439,10 @@ xfs_setattr( * mix so this probably isn't worth the trouble to optimize. */ code = 0; - if (tp) { - if (mp->m_flags & XFS_MOUNT_WSYNC) - xfs_trans_set_sync(tp); + if (mp->m_flags & XFS_MOUNT_WSYNC) + xfs_trans_set_sync(tp); - code = xfs_trans_commit(tp, commit_flags); - } + code = xfs_trans_commit(tp, commit_flags); xfs_iunlock(ip, lock_flags); |